author    | dim <dim@FreeBSD.org> | 2015-05-27 20:26:41 +0000
committer | dim <dim@FreeBSD.org> | 2015-05-27 20:26:41 +0000
commit    | 5ef8fd3549d38e883a31881636be3dc2a275de20 (patch)
tree      | bd13a22d9db57ccf3eddbc07b32c18109521d050 /contrib/llvm/lib/Transforms
parent    | 77794ebe2d5718eb502c93ec32f8ccae4d8a0b7b (diff)
parent    | 782067d0278612ee75d024b9b135c221c327e9e8 (diff)
Merge llvm trunk r238337 from ^/vendor/llvm/dist, resolve conflicts, and
preserve our customizations, where necessary.
Diffstat (limited to 'contrib/llvm/lib/Transforms')
137 files changed, 23593 insertions, 11030 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 3282022..7b7672d 100644 --- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -36,6 +36,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -69,16 +70,15 @@ namespace { bool runOnSCC(CallGraphSCC &SCC) override; static char ID; // Pass identification, replacement for typeid explicit ArgPromotion(unsigned maxElements = 3) - : CallGraphSCCPass(ID), DL(nullptr), maxElements(maxElements) { + : CallGraphSCCPass(ID), maxElements(maxElements) { initializeArgPromotionPass(*PassRegistry::getPassRegistry()); } /// A vector used to hold the indices of a single GEP instruction typedef std::vector<uint64_t> IndicesVector; - const DataLayout *DL; private: - bool isDenselyPacked(Type *type); + bool isDenselyPacked(Type *type, const DataLayout &DL); bool canPaddingBeAccessed(Argument *Arg); CallGraphNode *PromoteArguments(CallGraphNode *CGN); bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const; @@ -90,7 +90,7 @@ namespace { bool doInitialization(CallGraph &CG) override; /// The maximum number of elements to expand, or 0 for unlimited. unsigned maxElements; - DenseMap<const Function *, DISubprogram> FunctionDIs; + DenseMap<const Function *, DISubprogram *> FunctionDIs; }; } @@ -109,9 +109,6 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { bool Changed = false, LocalChange; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - do { // Iterate until we stop promoting from this SCC. LocalChange = false; // Attempt to promote arguments from all functions in this SCC. @@ -128,7 +125,7 @@ bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { } /// \brief Checks if a type could have padding bytes. -bool ArgPromotion::isDenselyPacked(Type *type) { +bool ArgPromotion::isDenselyPacked(Type *type, const DataLayout &DL) { // There is no size information, so be conservative. if (!type->isSized()) @@ -136,7 +133,7 @@ bool ArgPromotion::isDenselyPacked(Type *type) { // If the alloc size is not equal to the storage size, then there are padding // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. - if (!DL || DL->getTypeSizeInBits(type) != DL->getTypeAllocSizeInBits(type)) + if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) return false; if (!isa<CompositeType>(type)) @@ -144,19 +141,20 @@ bool ArgPromotion::isDenselyPacked(Type *type) { // For homogenous sequential types, check for padding within members. if (SequentialType *seqTy = dyn_cast<SequentialType>(type)) - return isa<PointerType>(seqTy) || isDenselyPacked(seqTy->getElementType()); + return isa<PointerType>(seqTy) || + isDenselyPacked(seqTy->getElementType(), DL); // Check for padding within and between elements of a struct. 
StructType *StructTy = cast<StructType>(type); - const StructLayout *Layout = DL->getStructLayout(StructTy); + const StructLayout *Layout = DL.getStructLayout(StructTy); uint64_t StartPos = 0; for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) { Type *ElTy = StructTy->getElementType(i); - if (!isDenselyPacked(ElTy)) + if (!isDenselyPacked(ElTy, DL)) return false; if (StartPos != Layout->getElementOffsetInBits(i)) return false; - StartPos += DL->getTypeAllocSizeInBits(ElTy); + StartPos += DL.getTypeAllocSizeInBits(ElTy); } return true; @@ -210,6 +208,13 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // Make sure that it is local to this module. if (!F || !F->hasLocalLinkage()) return nullptr; + // Don't promote arguments for variadic functions. Adding, removing, or + // changing non-pack parameters can change the classification of pack + // parameters. Frontends encode that classification at the call site in the + // IR, while in the callee the classification is determined dynamically based + // on the number of registers consumed so far. + if (F->isVarArg()) return nullptr; + // First check: see if there are any pointer arguments! If not, quick exit. SmallVector<Argument*, 16> PointerArgs; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) @@ -230,12 +235,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { isSelfRecursive = true; } - // Don't promote arguments for variadic functions. Adding, removing, or - // changing non-pack parameters can change the classification of pack - // parameters. Frontends encode that classification at the call site in the - // IR, while in the callee the classification is determined dynamically based - // on the number of registers consumed so far. - if (F->isVarArg()) return nullptr; + const DataLayout &DL = F->getParent()->getDataLayout(); // Check to see which arguments are promotable. If an argument is promotable, // add it to ArgsToPromote. @@ -250,8 +250,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // packed or if we can prove the padding bytes are never accessed. This does // not apply to inalloca. bool isSafeToPromote = - PtrArg->hasByValAttr() && - (isDenselyPacked(AgTy) || !canPaddingBeAccessed(PtrArg)); + PtrArg->hasByValAttr() && + (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg)); if (isSafeToPromote) { if (StructType *STy = dyn_cast<StructType>(AgTy)) { if (maxElements > 0 && STy->getNumElements() > maxElements) { @@ -310,9 +310,9 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { /// AllCallersPassInValidPointerForArgument - Return true if we can prove that /// all callees pass in a valid pointer for the specified function argument. -static bool AllCallersPassInValidPointerForArgument(Argument *Arg, - const DataLayout *DL) { +static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { Function *Callee = Arg->getParent(); + const DataLayout &DL = Callee->getParent()->getDataLayout(); unsigned ArgNo = Arg->getArgNo(); @@ -322,7 +322,7 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg, CallSite CS(U); assert(CS && "Should only have direct calls!"); - if (!CS.getArgument(ArgNo)->isDereferenceablePointer(DL)) + if (!isDereferenceablePointer(CS.getArgument(ArgNo), DL)) return false; } return true; @@ -430,7 +430,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, GEPIndicesSet ToPromote; // If the pointer is always valid, any load with first index 0 is valid. 
- if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg, DL)) + if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg)) SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); // First, iterate the entry block and mark loads of (geps of) arguments as @@ -561,8 +561,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, // Now check every path from the entry block to the load for transparency. // To do this, we perform a depth first search on the inverse CFG from the // loading block. - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(BB)) { for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks)) if (AA.canBasicBlockModify(*TranspBB, Loc)) return false; @@ -587,7 +586,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, FunctionType *FTy = F->getFunctionType(); std::vector<Type*> Params; - typedef std::set<IndicesVector> ScalarizeTable; + typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable; // ScalarizedElements - If we are promoting a pointer that has elements // accessed out of it, keep track of which elements are accessed so that we @@ -624,8 +623,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Simple byval argument? Just add all the struct element types. Type *AgTy = cast<PointerType>(I->getType())->getElementType(); StructType *STy = cast<StructType>(AgTy); - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) - Params.push_back(STy->getElementType(i)); + Params.insert(Params.end(), STy->element_begin(), STy->element_end()); ++NumByValArgsPromoted; } else if (!ArgsToPromote.count(I)) { // Unchanged argument @@ -648,7 +646,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, ScalarizeTable &ArgIndices = ScalarizedElements[I]; for (User *U : I->users()) { Instruction *UI = cast<Instruction>(U); - assert(isa<LoadInst>(UI) || isa<GetElementPtrInst>(UI)); + Type *SrcTy; + if (LoadInst *L = dyn_cast<LoadInst>(UI)) + SrcTy = L->getType(); + else + SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType(); IndicesVector Indices; Indices.reserve(UI->getNumOperands() - 1); // Since loads will only have a single operand, and GEPs only a single @@ -660,7 +662,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // GEPs with a single 0 index can be merged with direct loads if (Indices.size() == 1 && Indices.front() == 0) Indices.clear(); - ArgIndices.insert(Indices); + ArgIndices.insert(std::make_pair(SrcTy, Indices)); LoadInst *OrigLoad; if (LoadInst *L = dyn_cast<LoadInst>(UI)) OrigLoad = L; @@ -674,11 +676,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, for (ScalarizeTable::iterator SI = ArgIndices.begin(), E = ArgIndices.end(); SI != E; ++SI) { // not allowed to dereference ->begin() if size() is 0 - Params.push_back(GetElementPtrInst::getIndexedType(I->getType(), *SI)); + Params.push_back(GetElementPtrInst::getIndexedType( + cast<PointerType>(I->getType()->getScalarType())->getElementType(), + SI->second)); assert(Params.back()); } - if (ArgIndices.size() == 1 && ArgIndices.begin()->empty()) + if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) ++NumArgumentsPromoted; else ++NumAggregatesPromoted; @@ -702,8 +706,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Patch the pointer to LLVM function in debug info descriptor. 
auto DI = FunctionDIs.find(F); if (DI != FunctionDIs.end()) { - DISubprogram SP = DI->second; - SP.replaceFunction(NF); + DISubprogram *SP = DI->second; + SP->replaceFunction(NF); // Ensure the map is updated so it can be reused on subsequent argument // promotions of the same function. FunctionDIs.erase(DI); @@ -769,9 +773,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = GetElementPtrInst::Create(*AI, Idxs, - (*AI)->getName()+"."+utostr(i), - Call); + Value *Idx = GetElementPtrInst::Create( + STy, *AI, Idxs, (*AI)->getName() + "." + utostr(i), Call); // TODO: Tell AA about the new values? Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call)); } @@ -784,12 +787,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, for (ScalarizeTable::iterator SI = ArgIndices.begin(), E = ArgIndices.end(); SI != E; ++SI) { Value *V = *AI; - LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)]; - if (!SI->empty()) { - Ops.reserve(SI->size()); + LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, SI->second)]; + if (!SI->second.empty()) { + Ops.reserve(SI->second.size()); Type *ElTy = V->getType(); - for (IndicesVector::const_iterator II = SI->begin(), - IE = SI->end(); II != IE; ++II) { + for (IndicesVector::const_iterator II = SI->second.begin(), + IE = SI->second.end(); + II != IE; ++II) { // Use i32 to index structs, and i64 for others (pointers/arrays). // This satisfies GEP constraints. Type *IdxTy = (ElTy->isStructTy() ? @@ -800,7 +804,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II); } // And create a GEP to extract those indices. - V = GetElementPtrInst::Create(V, Ops, V->getName()+".idx", Call); + V = GetElementPtrInst::Create(SI->first, V, Ops, + V->getName() + ".idx", Call); Ops.clear(); AA.copyValue(OrigLoad->getOperand(0), V); } @@ -858,7 +863,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Update the callgraph to know that the callsite has been transformed. CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; - CalleeNode->replaceCallEdge(Call, New, NF_CGN); + CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN); if (!Call->use_empty()) { Call->replaceAllUsesWith(New); @@ -904,10 +909,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = - GetElementPtrInst::Create(TheAlloca, Idxs, - TheAlloca->getName()+"."+Twine(i), - InsertPt); + Value *Idx = GetElementPtrInst::Create( + AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." 
+ Twine(i), + InsertPt); I2->setName(I->getName()+"."+Twine(i)); new StoreInst(I2++, Idx, InsertPt); } @@ -940,7 +944,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, while (!I->use_empty()) { if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { - assert(ArgIndices.begin()->empty() && + assert(ArgIndices.begin()->second.empty() && "Load element should sort to front!"); I2->setName(I->getName()+".val"); LI->replaceAllUsesWith(I2); @@ -962,7 +966,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, Function::arg_iterator TheArg = I2; for (ScalarizeTable::iterator It = ArgIndices.begin(); - *It != Operands; ++It, ++TheArg) { + It->second != Operands; ++It, ++TheArg) { assert(It != ArgIndices.end() && "GEP not handled??"); } diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 0b6ade9..8ce7646 100644 --- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -52,7 +52,6 @@ namespace { // alignment to a concrete value. unsigned getAlignment(GlobalVariable *GV) const; - const DataLayout *DL; }; } @@ -89,32 +88,22 @@ static bool IsBetterCanonical(const GlobalVariable &A, return A.hasUnnamedAddr(); } -bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const { - return DL || GV->getAlignment() != 0; -} - unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const { unsigned Align = GV->getAlignment(); if (Align) return Align; - if (DL) - return DL->getPreferredAlignment(GV); - return 0; + return GV->getParent()->getDataLayout().getPreferredAlignment(GV); } bool ConstantMerge::runOnModule(Module &M) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; // Find all the globals that are marked "used". These cannot be merged. SmallPtrSet<const GlobalValue*, 8> UsedGlobals; FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals); FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals); - - // Map unique <constants, has-unknown-alignment> pairs to globals. We don't - // want to merge globals of unknown alignment with those of explicit - // alignment. If we have DataLayout, we always know the alignment. - DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap; + + // Map unique constants to globals. + DenseMap<Constant *, GlobalVariable *> CMap; // Replacements - This vector contains a list of replacements to perform. SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements; @@ -156,8 +145,7 @@ bool ConstantMerge::runOnModule(Module &M) { Constant *Init = GV->getInitializer(); // Check to see if the initializer is already known. - PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV)); - GlobalVariable *&Slot = CMap[Pair]; + GlobalVariable *&Slot = CMap[Init]; // If this is the first constant we find or if the old one is local, // replace with the current one. If the current is externally visible @@ -188,8 +176,7 @@ bool ConstantMerge::runOnModule(Module &M) { Constant *Init = GV->getInitializer(); // Check to see if the initializer is already known. 
- PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV)); - GlobalVariable *Slot = CMap[Pair]; + GlobalVariable *Slot = CMap[Init]; if (!Slot || Slot == GV) continue; diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 4045c09..76898f2 100644 --- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -73,8 +73,8 @@ namespace { } std::string getDescription() const { - return std::string((IsArg ? "Argument #" : "Return value #")) - + utostr(Idx) + " of function " + F->getName().str(); + return (Twine(IsArg ? "Argument #" : "Return value #") + utostr(Idx) + + " of function " + F->getName()).str(); } }; @@ -127,7 +127,7 @@ namespace { // As the code generation for module is finished (and DIBuilder is // finalized) we assume that subprogram descriptors won't be changed, and // they are stored in map for short duration anyway. - DenseMap<const Function *, DISubprogram> FunctionDIs; + DenseMap<const Function *, DISubprogram *> FunctionDIs; protected: // DAH uses this to specify a different ID. @@ -146,7 +146,7 @@ namespace { private: Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses); Liveness SurveyUse(const Use *U, UseVector &MaybeLiveUses, - unsigned RetValNum = 0); + unsigned RetValNum = -1U); Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); void SurveyFunction(const Function &F); @@ -303,8 +303,8 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // Patch the pointer to LLVM function in debug info descriptor. auto DI = FunctionDIs.find(&Fn); if (DI != FunctionDIs.end()) { - DISubprogram SP = DI->second; - SP.replaceFunction(NF); + DISubprogram *SP = DI->second; + SP->replaceFunction(NF); // Ensure the map is updated so it can be reused on non-varargs argument // eliminations of the same function. FunctionDIs.erase(DI); @@ -387,14 +387,32 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) /// for void functions and 1 for functions not returning a struct. It returns /// the number of struct elements for functions returning a struct. static unsigned NumRetVals(const Function *F) { - if (F->getReturnType()->isVoidTy()) + Type *RetTy = F->getReturnType(); + if (RetTy->isVoidTy()) return 0; - else if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) + else if (StructType *STy = dyn_cast<StructType>(RetTy)) return STy->getNumElements(); + else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy)) + return ATy->getNumElements(); else return 1; } +/// Returns the sub-type a function will return at a given Idx. Should +/// correspond to the result type of an ExtractValue instruction executed with +/// just that one Idx (i.e. only top-level structure is considered). +static Type *getRetComponentType(const Function *F, unsigned Idx) { + Type *RetTy = F->getReturnType(); + assert(!RetTy->isVoidTy() && "void type has no subtype"); + + if (StructType *STy = dyn_cast<StructType>(RetTy)) + return STy->getElementType(Idx); + else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy)) + return ATy->getElementType(); + else + return RetTy; +} + /// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not /// live, it adds Use to the MaybeLiveUses argument. Returns the determined /// liveness of Use. @@ -425,9 +443,24 @@ DAE::Liveness DAE::SurveyUse(const Use *U, // function's return value is live. 
We use RetValNum here, for the case // that U is really a use of an insertvalue instruction that uses the // original Use. - RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum); - // We might be live, depending on the liveness of Use. - return MarkIfNotLive(Use, MaybeLiveUses); + const Function *F = RI->getParent()->getParent(); + if (RetValNum != -1U) { + RetOrArg Use = CreateRet(F, RetValNum); + // We might be live, depending on the liveness of Use. + return MarkIfNotLive(Use, MaybeLiveUses); + } else { + DAE::Liveness Result = MaybeLive; + for (unsigned i = 0; i < NumRetVals(F); ++i) { + RetOrArg Use = CreateRet(F, i); + // We might be live, depending on the liveness of Use. If any + // sub-value is live, then the entire value is considered live. This + // is a conservative choice, and better tracking is possible. + DAE::Liveness SubResult = MarkIfNotLive(Use, MaybeLiveUses); + if (Result != Live) + Result = SubResult; + } + return Result; + } } if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) { if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex() @@ -449,7 +482,7 @@ DAE::Liveness DAE::SurveyUse(const Use *U, return Result; } - if (ImmutableCallSite CS = V) { + if (auto CS = ImmutableCallSite(V)) { const Function *F = CS.getCalledFunction(); if (F) { // Used in a direct call. @@ -541,7 +574,6 @@ void DAE::SurveyFunction(const Function &F) { // Keep track of the number of live retvals, so we can skip checks once all // of them turn out to be live. unsigned NumLiveRetVals = 0; - Type *STy = dyn_cast<StructType>(F.getReturnType()); // Loop all uses of the function. for (const Use &U : F.uses()) { // If the function is PASSED IN as an argument, its address has been @@ -563,34 +595,35 @@ void DAE::SurveyFunction(const Function &F) { // Now, check how our return value(s) is/are used in this caller. Don't // bother checking return values if all of them are live already. - if (NumLiveRetVals != RetCount) { - if (STy) { - // Check all uses of the return value. - for (const User *U : TheCall->users()) { - const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U); - if (Ext && Ext->hasIndices()) { - // This use uses a part of our return value, survey the uses of - // that part and store the results for this index only. - unsigned Idx = *Ext->idx_begin(); - if (RetValLiveness[Idx] != Live) { - RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]); - if (RetValLiveness[Idx] == Live) - NumLiveRetVals++; - } - } else { - // Used by something else than extractvalue. Mark all return - // values as live. - for (unsigned i = 0; i != RetCount; ++i ) - RetValLiveness[i] = Live; - NumLiveRetVals = RetCount; - break; - } + if (NumLiveRetVals == RetCount) + continue; + + // Check all uses of the return value. + for (const Use &U : TheCall->uses()) { + if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) { + // This use uses a part of our return value, survey the uses of + // that part and store the results for this index only. + unsigned Idx = *Ext->idx_begin(); + if (RetValLiveness[Idx] != Live) { + RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]); + if (RetValLiveness[Idx] == Live) + NumLiveRetVals++; } } else { - // Single return value - RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]); - if (RetValLiveness[0] == Live) + // Used by something else than extractvalue. Survey, but assume that the + // result applies to all sub-values. 
+ UseVector MaybeLiveAggregateUses; + if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) { NumLiveRetVals = RetCount; + RetValLiveness.assign(RetCount, Live); + break; + } else { + for (unsigned i = 0; i != RetCount; ++i) { + if (RetValLiveness[i] != Live) + MaybeLiveRetUses[i].append(MaybeLiveAggregateUses.begin(), + MaybeLiveAggregateUses.end()); + } + } } } } @@ -775,39 +808,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (RetTy->isVoidTy() || HasLiveReturnedArg) { NRetTy = RetTy; } else { - StructType *STy = dyn_cast<StructType>(RetTy); - if (STy) - // Look at each of the original return values individually. - for (unsigned i = 0; i != RetCount; ++i) { - RetOrArg Ret = CreateRet(F, i); - if (LiveValues.erase(Ret)) { - RetTypes.push_back(STy->getElementType(i)); - NewRetIdxs[i] = RetTypes.size() - 1; - } else { - ++NumRetValsEliminated; - DEBUG(dbgs() << "DAE - Removing return value " << i << " from " - << F->getName() << "\n"); - } - } - else - // We used to return a single value. - if (LiveValues.erase(CreateRet(F, 0))) { - RetTypes.push_back(RetTy); - NewRetIdxs[0] = 0; + // Look at each of the original return values individually. + for (unsigned i = 0; i != RetCount; ++i) { + RetOrArg Ret = CreateRet(F, i); + if (LiveValues.erase(Ret)) { + RetTypes.push_back(getRetComponentType(F, i)); + NewRetIdxs[i] = RetTypes.size() - 1; } else { - DEBUG(dbgs() << "DAE - Removing return value from " << F->getName() - << "\n"); ++NumRetValsEliminated; + DEBUG(dbgs() << "DAE - Removing return value " << i << " from " + << F->getName() << "\n"); + } + } + if (RetTypes.size() > 1) { + // More than one return type? Reduce it down to size. + if (StructType *STy = dyn_cast<StructType>(RetTy)) { + // Make the new struct packed if we used to return a packed struct + // already. + NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked()); + } else { + assert(isa<ArrayType>(RetTy) && "unexpected multi-value return"); + NRetTy = ArrayType::get(RetTypes[0], RetTypes.size()); } - if (RetTypes.size() > 1) - // More than one return type? Return a struct with them. Also, if we used - // to return a struct and didn't change the number of return values, - // return a struct again. This prevents changing {something} into - // something and {} into void. - // Make the new struct packed if we used to return a packed struct - // already. - NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked()); - else if (RetTypes.size() == 1) + } else if (RetTypes.size() == 1) // One return type? Just a simple value then, but only if we didn't use to // return a struct with that simple value before. NRetTy = RetTypes.front(); @@ -826,17 +849,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // here. Currently, this should not be possible, but special handling might be // required when new return value attributes are added. if (NRetTy->isVoidTy()) - RAttrs = - AttributeSet::get(NRetTy->getContext(), AttributeSet::ReturnIndex, - AttrBuilder(RAttrs, AttributeSet::ReturnIndex). - removeAttributes(AttributeFuncs:: - typeIncompatible(NRetTy, AttributeSet::ReturnIndex), - AttributeSet::ReturnIndex)); + RAttrs = RAttrs.removeAttributes(NRetTy->getContext(), + AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NRetTy)); else assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex). 
- hasAttributes(AttributeFuncs:: - typeIncompatible(NRetTy, AttributeSet::ReturnIndex), - AttributeSet::ReturnIndex) && + overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && "Return attributes no longer compatible?"); if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) @@ -880,13 +898,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributeSet RAttrs = CallPAL.getRetAttributes(); // Adjust in case the function was changed to return void. - RAttrs = - AttributeSet::get(NF->getContext(), AttributeSet::ReturnIndex, - AttrBuilder(RAttrs, AttributeSet::ReturnIndex). - removeAttributes(AttributeFuncs:: - typeIncompatible(NF->getReturnType(), - AttributeSet::ReturnIndex), - AttributeSet::ReturnIndex)); + RAttrs = RAttrs.removeAttributes(NRetTy->getContext(), + AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NF->getReturnType())); if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs)); @@ -959,9 +973,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (!Call->getType()->isX86_MMXTy()) Call->replaceAllUsesWith(Constant::getNullValue(Call->getType())); } else { - assert(RetTy->isStructTy() && + assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. The old return type" - " must have been a struct!"); + " must have been a struct or an array!"); Instruction *InsertPt = Call; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock::iterator IP = II->getNormalDest()->begin(); @@ -969,9 +983,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { InsertPt = IP; } - // We used to return a struct. Instead of doing smart stuff with all the - // uses of this struct, we will just rebuild it using - // extract/insertvalue chaining and let instcombine clean that up. + // We used to return a struct or array. Instead of doing smart stuff + // with all the uses, we will just rebuild it using extract/insertvalue + // chaining and let instcombine clean that up. // // Start out building up our return value from undef Value *RetVal = UndefValue::get(RetTy); @@ -1034,8 +1048,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (NFTy->getReturnType()->isVoidTy()) { RetVal = nullptr; } else { - assert (RetTy->isStructTy()); - // The original return value was a struct, insert + assert(RetTy->isStructTy() || RetTy->isArrayTy()); + // The original return value was a struct or array, insert // extractvalue/insertvalue chains to extract only the values we need // to return and insert them into our new result. // This does generate messy code, but we'll let it to instcombine to @@ -1069,7 +1083,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // Patch the pointer to LLVM function in debug info descriptor. auto DI = FunctionDIs.find(F); if (DI != FunctionDIs.end()) - DI->second.replaceFunction(NF); + DI->second->replaceFunction(NF); // Now that the old function is dead, delete it. 
F->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 823ae53..92e384a 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -31,7 +31,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; #define DEBUG_TYPE "functionattrs" @@ -124,7 +124,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -139,7 +139,7 @@ INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) @@ -703,10 +703,14 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { } if (ReadAttr != Attribute::None) { - AttrBuilder B; + AttrBuilder B, R; B.addAttribute(ReadAttr); + R.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; + // Clear out existing readonly/readnone attributes + A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, R)); A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; Changed = true; @@ -755,8 +759,8 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, } case Instruction::PHI: { PHINode *PN = cast<PHINode>(RVI); - for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - FlowsToReturn.insert(PN->getIncomingValue(i)); + for (Value *IncValue : PN->incoming_values()) + FlowsToReturn.insert(IncValue); continue; } @@ -1702,7 +1706,7 @@ bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { AA = &getAnalysis<AliasAnalysis>(); - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = annotateLibraryCalls(SCC); Changed |= AddReadAttrs(SCC); diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 0c844fe..ba04c80 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Pass.h" +#include <unordered_map> using namespace llvm; #define DEBUG_TYPE "globaldce" @@ -47,6 +48,7 @@ namespace { private: SmallPtrSet<GlobalValue*, 32> AliveGlobals; SmallPtrSet<Constant *, 8> SeenConstants; + std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; /// GlobalIsNeeded - mark the specific global value as needed, and /// recursively mark anything that it uses as also needed. @@ -78,6 +80,17 @@ bool GlobalDCE::runOnModule(Module &M) { // Remove empty functions from the global ctors list. 
Changed |= optimizeGlobalCtorsList(M, isEmptyFunction); + // Collect the set of members for each comdat. + for (Function &F : M) + if (Comdat *C = F.getComdat()) + ComdatMembers.insert(std::make_pair(C, &F)); + for (GlobalVariable &GV : M.globals()) + if (Comdat *C = GV.getComdat()) + ComdatMembers.insert(std::make_pair(C, &GV)); + for (GlobalAlias &GA : M.aliases()) + if (Comdat *C = GA.getComdat()) + ComdatMembers.insert(std::make_pair(C, &GA)); + // Loop over the module, adding globals which are obviously necessary. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); @@ -177,6 +190,7 @@ bool GlobalDCE::runOnModule(Module &M) { // Make sure that all memory is released AliveGlobals.clear(); SeenConstants.clear(); + ComdatMembers.clear(); return Changed; } @@ -188,17 +202,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { if (!AliveGlobals.insert(G).second) return; - Module *M = G->getParent(); if (Comdat *C = G->getComdat()) { - for (Function &F : *M) - if (F.getComdat() == C) - GlobalIsNeeded(&F); - for (GlobalVariable &GV : M->globals()) - if (GV.getComdat() == C) - GlobalIsNeeded(&GV); - for (GlobalAlias &GA : M->aliases()) - if (GA.getComdat() == C) - GlobalIsNeeded(&GA); + for (auto &&CM : make_range(ComdatMembers.equal_range(C))) + GlobalIsNeeded(CM.second); } if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) { diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 6e0ae83..cc4a79f 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" @@ -38,7 +39,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -68,7 +68,7 @@ STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); namespace { struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -86,7 +86,6 @@ namespace { const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); - const DataLayout *DL; TargetLibraryInfo *TLI; SmallSet<const Comdat *, 8> NotDiscardableComdats; }; @@ -95,7 +94,7 @@ namespace { char GlobalOpt::ID = 0; INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) @@ -269,7 +268,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, /// quick scan over the use list to clean up the easy and obvious cruft. This /// returns true if it made a change. 
static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, - const DataLayout *DL, + const DataLayout &DL, TargetLibraryInfo *TLI) { bool Changed = false; // Note that we need to use a weak value handle for the worklist items. When @@ -318,8 +317,8 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // and will invalidate our notion of what Init is. Constant *SubInit = nullptr; if (!isa<ConstantExpr>(GEP->getOperand(0))) { - ConstantExpr *CE = - dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, DL, TLI)); + ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>( + ConstantFoldInstruction(GEP, DL, TLI)); if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); @@ -565,6 +564,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access. Value *NewPtr = NewGlobals[Val]; + Type *NewTy = NewGlobals[Val]->getValueType(); // Form a shorter GEP if needed. if (GEP->getNumOperands() > 3) { @@ -573,15 +573,16 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { Idxs.push_back(NullInt); for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i) Idxs.push_back(CE->getOperand(i)); - NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr), Idxs); + NewPtr = + ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs); } else { GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP); SmallVector<Value*, 8> Idxs; Idxs.push_back(NullInt); for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) Idxs.push_back(GEPI->getOperand(i)); - NewPtr = GetElementPtrInst::Create(NewPtr, Idxs, - GEPI->getName()+"."+Twine(Val),GEPI); + NewPtr = GetElementPtrInst::Create( + NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(Val), GEPI); } } GEP->replaceAllUsesWith(NewPtr); @@ -721,8 +722,8 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { else break; if (Idxs.size() == GEPI->getNumOperands()-1) - Changed |= OptimizeAwayTrappingUsesOfValue(GEPI, - ConstantExpr::getGetElementPtr(NewV, Idxs)); + Changed |= OptimizeAwayTrappingUsesOfValue( + GEPI, ConstantExpr::getGetElementPtr(nullptr, NewV, Idxs)); if (GEPI->use_empty()) { Changed = true; GEPI->eraseFromParent(); @@ -739,7 +740,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { /// if the loaded value is dynamically null, then we know that they cannot be /// reachable with a null optimize away the load. static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, - const DataLayout *DL, + const DataLayout &DL, TargetLibraryInfo *TLI) { bool Changed = false; @@ -802,7 +803,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, /// ConstantPropUsersOf - Walk the use list of V, constant folding all of the /// instructions that are foldable. -static void ConstantPropUsersOf(Value *V, const DataLayout *DL, +static void ConstantPropUsersOf(Value *V, const DataLayout &DL, TargetLibraryInfo *TLI) { for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; ) if (Instruction *I = dyn_cast<Instruction>(*UI++)) @@ -822,12 +823,10 @@ static void ConstantPropUsersOf(Value *V, const DataLayout *DL, /// the specified malloc. Because it is always the result of the specified /// malloc, there is no reason to actually DO the malloc. Instead, turn the /// malloc into a global, and any loads of GV as uses of the new global. 
-static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, - CallInst *CI, - Type *AllocTy, - ConstantInt *NElements, - const DataLayout *DL, - TargetLibraryInfo *TLI) { +static GlobalVariable * +OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, + ConstantInt *NElements, const DataLayout &DL, + TargetLibraryInfo *TLI) { DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n'); Type *GlobalType; @@ -1167,7 +1166,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, InsertedScalarizedValues, PHIsToRewrite), LI->getName()+".f"+Twine(FieldNo), LI); - } else if (PHINode *PN = dyn_cast<PHINode>(V)) { + } else { + PHINode *PN = cast<PHINode>(V); // PN's type is pointer to struct. Make a new PHI of pointer to struct // field. @@ -1181,8 +1181,6 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, PN->getName()+".f"+Twine(FieldNo), PN); Result = NewPN; PHIsToRewrite.push_back(std::make_pair(PN, FieldNo)); - } else { - llvm_unreachable("Unknown usable value"); } return FieldVals[FieldNo] = Result; @@ -1224,7 +1222,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, GEPIdx.push_back(GEPI->getOperand(1)); GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end()); - Value *NGEPI = GetElementPtrInst::Create(NewPtr, GEPIdx, + Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx, GEPI->getName(), GEPI); GEPI->replaceAllUsesWith(NGEPI); GEPI->eraseFromParent(); @@ -1271,7 +1269,7 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, /// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break /// it up into multiple allocations of arrays of the fields. static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, - Value *NElems, const DataLayout *DL, + Value *NElems, const DataLayout &DL, const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n'); Type *MAT = getMallocAllocatedType(CI, TLI); @@ -1301,10 +1299,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); - unsigned TypeSize = DL->getTypeAllocSize(FieldTy); + unsigned TypeSize = DL.getTypeAllocSize(FieldTy); if (StructType *ST = dyn_cast<StructType>(FieldTy)) - TypeSize = DL->getStructLayout(ST)->getSizeInBytes(); - Type *IntPtrTy = DL->getIntPtrType(CI->getType()); + TypeSize = DL.getStructLayout(ST)->getSizeInBytes(); + Type *IntPtrTy = DL.getIntPtrType(CI->getType()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), NElems, nullptr, @@ -1459,16 +1457,12 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, /// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a /// pointer global variable with a single value stored it that is a malloc or /// cast of malloc. -static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, - CallInst *CI, +static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, Type *AllocTy, AtomicOrdering Ordering, Module::global_iterator &GVI, - const DataLayout *DL, + const DataLayout &DL, TargetLibraryInfo *TLI) { - if (!DL) - return false; - // If this is a malloc of an abstract type, don't touch it. 
if (!AllocTy->isSized()) return false; @@ -1504,7 +1498,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // Restrict this transformation to only working on small allocations // (2048 bytes currently), as we don't want to introduce a 16M global or // something. - if (NElements->getZExtValue() * DL->getTypeAllocSize(AllocTy) < 2048) { + if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); return true; } @@ -1534,8 +1528,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // If this is a fixed size array, transform the Malloc to be an alloc of // structs. malloc [100 x struct],1 -> malloc struct, 100 if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) { - Type *IntPtrTy = DL->getIntPtrType(CI->getType()); - unsigned TypeSize = DL->getStructLayout(AllocSTy)->getSizeInBytes(); + Type *IntPtrTy = DL.getIntPtrType(CI->getType()); + unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, @@ -1563,7 +1557,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, Module::global_iterator &GVI, - const DataLayout *DL, + const DataLayout &DL, TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); @@ -1733,6 +1727,7 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, Module::global_iterator &GVI, const GlobalStatus &GS) { + auto &DL = GV->getParent()->getDataLayout(); // If this is a first class global and has only one accessing function // and this function is main (which we know is not recursive), we replace // the global with a local alloca in this function. @@ -1804,12 +1799,10 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, ++NumMarked; return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { - if (DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>()) { - const DataLayout &DL = DLP->getDataLayout(); - if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { - GVI = FirstNewGV; // Don't skip the newly produced globals! - return true; - } + const DataLayout &DL = GV->getParent()->getDataLayout(); + if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { + GVI = FirstNewGV; // Don't skip the newly produced globals! + return true; } } else if (GS.StoredType == GlobalStatus::StoredOnce) { // If the initial value for the global was an undef value, and if only @@ -1954,6 +1947,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { // Simplify the initializer. if (GV->hasInitializer()) if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) { + auto &DL = M.getDataLayout(); Constant *New = ConstantFoldConstantExpression(CE, DL, TLI); if (New && New != CE) GV->setInitializer(New); @@ -1971,9 +1965,8 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { static inline bool isSimpleEnoughValueToCommit(Constant *C, - SmallPtrSetImpl<Constant*> &SimpleConstants, - const DataLayout *DL); - + SmallPtrSetImpl<Constant *> &SimpleConstants, + const DataLayout &DL); /// isSimpleEnoughValueToCommit - Return true if the specified constant can be /// handled by the code generator. 
We don't want to generate something like: @@ -1983,9 +1976,10 @@ isSimpleEnoughValueToCommit(Constant *C, /// This function should be called if C was not found (but just got inserted) /// in SimpleConstants to avoid having to rescan the same constants all the /// time. -static bool isSimpleEnoughValueToCommitHelper(Constant *C, - SmallPtrSetImpl<Constant*> &SimpleConstants, - const DataLayout *DL) { +static bool +isSimpleEnoughValueToCommitHelper(Constant *C, + SmallPtrSetImpl<Constant *> &SimpleConstants, + const DataLayout &DL) { // Simple global addresses are supported, do not allow dllimport or // thread-local globals. if (auto *GV = dyn_cast<GlobalValue>(C)) @@ -2019,8 +2013,8 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, case Instruction::PtrToInt: // int <=> ptr is fine if the int type is the same size as the // pointer type. - if (!DL || DL->getTypeSizeInBits(CE->getType()) != - DL->getTypeSizeInBits(CE->getOperand(0)->getType())) + if (DL.getTypeSizeInBits(CE->getType()) != + DL.getTypeSizeInBits(CE->getOperand(0)->getType())) return false; return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); @@ -2042,8 +2036,8 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C, static inline bool isSimpleEnoughValueToCommit(Constant *C, - SmallPtrSetImpl<Constant*> &SimpleConstants, - const DataLayout *DL) { + SmallPtrSetImpl<Constant *> &SimpleConstants, + const DataLayout &DL) { // If we already checked this constant, we win. if (!SimpleConstants.insert(C).second) return true; @@ -2174,8 +2168,8 @@ namespace { /// Once an evaluation call fails, the evaluation object should not be reused. class Evaluator { public: - Evaluator(const DataLayout *DL, const TargetLibraryInfo *TLI) - : DL(DL), TLI(TLI) { + Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI) + : DL(DL), TLI(TLI) { ValueStack.emplace_back(); } @@ -2249,7 +2243,7 @@ private: /// simple enough to live in a static initializer of a global. SmallPtrSet<Constant*, 8> SimpleConstants; - const DataLayout *DL; + const DataLayout &DL; const TargetLibraryInfo *TLI; }; @@ -2345,7 +2339,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Constant *IdxZero = ConstantInt::get(IdxTy, 0, false); Constant * const IdxList[] = {IdxZero, IdxZero}; - Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList); + Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) Ptr = ConstantFoldConstantExpression(CE, DL, TLI); @@ -2409,8 +2403,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, i != e; ++i) GEPOps.push_back(getVal(*i)); InstResult = - ConstantExpr::getGetElementPtr(P, GEPOps, - cast<GEPOperator>(GEP)->isInBounds()); + ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps, + cast<GEPOperator>(GEP)->isInBounds()); DEBUG(dbgs() << "Found a GEP! 
Simplifying: " << *InstResult << "\n"); } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) { @@ -2498,9 +2492,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Value *Ptr = PtrArg->stripPointerCasts(); if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { Type *ElemTy = cast<PointerType>(GV->getType())->getElementType(); - if (DL && !Size->isAllOnesValue() && + if (!Size->isAllOnesValue() && Size->getValue().getLimitedValue() >= - DL->getTypeStoreSize(ElemTy)) { + DL.getTypeStoreSize(ElemTy)) { Invariants.insert(GV); DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV << "\n"); @@ -2689,7 +2683,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, /// EvaluateStaticConstructor - Evaluate static constructors in the function, if /// we can. Return true if we can, false otherwise. -static bool EvaluateStaticConstructor(Function *F, const DataLayout *DL, +static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, const TargetLibraryInfo *TLI) { // Call the function. Evaluator Eval(DL, TLI); @@ -3040,9 +3034,8 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { bool GlobalOpt::runOnModule(Module &M) { bool Changed = false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + auto &DL = M.getDataLayout(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool LocalChange = true; while (LocalChange) { diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index b4d31d8..fcacec328 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -16,7 +16,7 @@ #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/IPO.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Transforms/IPO.h" using namespace llvm; @@ -36,6 +36,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeLoopExtractorPass(Registry); initializeBlockExtractorPassPass(Registry); initializeSingleLoopExtractorPass(Registry); + initializeLowerBitSetsPass(Registry); initializeMergeFunctionsPass(Registry); initializePartialInlinerPass(Registry); initializePruneEHPass(Registry); diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 6686743..8f65a98 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" @@ -29,7 +30,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -72,8 +72,8 @@ Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) InlineLimit : Threshold), InsertLifetime(InsertLifetime) {} -/// getAnalysisUsage - For this class, we declare that we require and preserve -/// the call graph. If the derived class implements this method, it should +/// For this class, we declare that we require and preserve the call graph. 
+/// If the derived class implements this method, it should /// always explicitly call the implementation here. void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); @@ -97,40 +97,31 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), AttributeSet::FunctionIndex, B); - AttributeSet CallerAttr = Caller->getAttributes(), - CalleeAttr = Callee->getAttributes(); - if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq)) { + if (Callee->hasFnAttribute(Attribute::StackProtectReq)) { Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); Caller->addFnAttr(Attribute::StackProtectReq); - } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectStrong) && - !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq)) { + } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) && + !Caller->hasFnAttribute(Attribute::StackProtectReq)) { Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); Caller->addFnAttr(Attribute::StackProtectStrong); - } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtect) && - !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq) && - !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectStrong)) + } else if (Callee->hasFnAttribute(Attribute::StackProtect) && + !Caller->hasFnAttribute(Attribute::StackProtectReq) && + !Caller->hasFnAttribute(Attribute::StackProtectStrong)) Caller->addFnAttr(Attribute::StackProtect); } -/// InlineCallIfPossible - If it is possible to inline the specified call site, +/// If it is possible to inline the specified call site, /// do so and update the CallGraph for this operation. /// /// This function also does some basic book-keeping to update the IR. The /// InlinedArrayAllocas map keeps track of any allocas that are already -/// available from other functions inlined into the caller. If we are able to +/// available from other functions inlined into the caller. If we are able to /// inline this call site we attempt to reuse already available allocas or add /// any new allocas to the set if not possible. static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas, - int InlineHistory, bool InsertLifetime, - const DataLayout *DL) { + int InlineHistory, bool InsertLifetime) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -206,11 +197,6 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, unsigned Align1 = AI->getAlignment(), Align2 = AvailableAlloca->getAlignment(); - // If we don't have data layout information, and only one alloca is using - // the target default, then we can't safely merge them because we can't - // pick the greater alignment. - if (!DL && (!Align1 || !Align2) && Align1 != Align2) - continue; // The available alloca has to be in the right function, not in some other // function in this SCC. 
@@ -231,8 +217,8 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, if (Align1 != Align2) { if (!Align1 || !Align2) { - assert(DL && "DataLayout required to compare default alignments"); - unsigned TypeAlign = DL->getABITypeAlignment(AI->getAllocatedType()); + const DataLayout &DL = Caller->getParent()->getDataLayout(); + unsigned TypeAlign = DL.getABITypeAlignment(AI->getAllocatedType()); Align1 = Align1 ? Align1 : TypeAlign; Align2 = Align2 ? Align2 : TypeAlign; @@ -273,8 +259,7 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { // would decrease the threshold. Function *Caller = CS.getCaller(); bool OptSize = Caller && !Caller->isDeclaration() && - Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize); + Caller->hasFnAttribute(Attribute::OptimizeForSize); if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres) thres = OptSizeThreshold; @@ -283,17 +268,14 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { // and the caller does not need to minimize its size. Function *Callee = CS.getCalledFunction(); bool InlineHint = Callee && !Callee->isDeclaration() && - Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::InlineHint); - if (InlineHint && HintThreshold > thres - && !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize)) + Callee->hasFnAttribute(Attribute::InlineHint); + if (InlineHint && HintThreshold > thres && + !Caller->hasFnAttribute(Attribute::MinSize)) thres = HintThreshold; // Listen to the cold attribute when it would decrease the threshold. bool ColdCallee = Callee && !Callee->isDeclaration() && - Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::Cold); + Callee->hasFnAttribute(Attribute::Cold); // Command line argument for InlineLimit will override the default // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, // do not use the default cold threshold even if it is smaller. @@ -312,8 +294,7 @@ static void emitAnalysis(CallSite CS, const Twine &Msg) { emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg); } -/// shouldInline - Return true if the inliner should attempt to inline -/// at the given CallSite. +/// Return true if the inliner should attempt to inline at the given CallSite. bool Inliner::shouldInline(CallSite CS) { InlineCost IC = getInlineCost(CS); @@ -427,7 +408,7 @@ bool Inliner::shouldInline(CallSite CS) { return true; } -/// InlineHistoryIncludes - Return true if the specified inline history ID +/// Return true if the specified inline history ID /// indicates an inline history that includes the specified function. static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, const SmallVectorImpl<std::pair<Function*, int> > &InlineHistory) { @@ -444,9 +425,8 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + const TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI() : nullptr; AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); SmallPtrSet<Function*, 8> SCCFunctions; @@ -506,7 +486,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, DL, AA, ACT); + InlineFunctionInfo InlineInfo(&CG, AA, ACT); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -564,7 +544,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // Attempt to inline the function. if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, - InlineHistoryID, InsertLifetime, DL)) { + InlineHistoryID, InsertLifetime)) { emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, Twine(Callee->getName() + " will not be inlined into " + @@ -636,16 +616,30 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { return Changed; } -// doFinalization - Remove now-dead linkonce functions at the end of -// processing to avoid breaking the SCC traversal. +/// Remove now-dead linkonce functions at the end of +/// processing to avoid breaking the SCC traversal. bool Inliner::doFinalization(CallGraph &CG) { return removeDeadFunctions(CG); } -/// removeDeadFunctions - Remove dead functions that are not included in -/// DNR (Do Not Remove) list. +/// Remove dead functions that are not included in DNR (Do Not Remove) list. bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { SmallVector<CallGraphNode*, 16> FunctionsToRemove; + SmallVector<CallGraphNode *, 16> DeadFunctionsInComdats; + SmallDenseMap<const Comdat *, int, 16> ComdatEntriesAlive; + + auto RemoveCGN = [&](CallGraphNode *CGN) { + // Remove any call graph edges from the function to its callees. + CGN->removeAllCalledFunctions(); + + // Remove any edges from the external node to the function's call graph + // node. These edges might have been made irrelevant due to + // optimization of the program. + CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN); + + // Remove the node for the callee from the call graph and delete it. + FunctionsToRemove.push_back(CGN); + }; // Scan for all of the functions, looking for ones that should now be removed // from the program. Insert the dead ones in the FunctionsToRemove set. @@ -658,9 +652,7 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Handle the case when this function is called and we only want to care // about always-inline functions. This is a bit of a hack to share code // between here and the InlineAlways pass. - if (AlwaysInlineOnly && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::AlwaysInline)) + if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline)) continue; // If the only remaining users of the function are dead constants, remove @@ -674,20 +666,45 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // without also dropping the other members of the COMDAT. // The inliner doesn't visit non-function entities which are in COMDAT // groups so it is unsafe to do so *unless* the linkage is local. - if (!F->hasLocalLinkage() && F->hasComdat()) - continue; - - // Remove any call graph edges from the function to its callees. - CGN->removeAllCalledFunctions(); + if (!F->hasLocalLinkage()) { + if (const Comdat *C = F->getComdat()) { + --ComdatEntriesAlive[C]; + DeadFunctionsInComdats.push_back(CGN); + continue; + } + } - // Remove any edges from the external node to the function's call graph - // node.
These edges might have been made irrelegant due to - // optimization of the program. - CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN); + RemoveCGN(CGN); + } + if (!DeadFunctionsInComdats.empty()) { + // Count up all the entities in COMDAT groups + auto ComdatGroupReferenced = [&](const Comdat *C) { + auto I = ComdatEntriesAlive.find(C); + if (I != ComdatEntriesAlive.end()) + ++(I->getSecond()); + }; + for (const Function &F : CG.getModule()) + if (const Comdat *C = F.getComdat()) + ComdatGroupReferenced(C); + for (const GlobalVariable &GV : CG.getModule().globals()) + if (const Comdat *C = GV.getComdat()) + ComdatGroupReferenced(C); + for (const GlobalAlias &GA : CG.getModule().aliases()) + if (const Comdat *C = GA.getComdat()) + ComdatGroupReferenced(C); + for (CallGraphNode *CGN : DeadFunctionsInComdats) { + Function *F = CGN->getFunction(); + const Comdat *C = F->getComdat(); + int NumAlive = ComdatEntriesAlive[C]; + // We can remove functions in a COMDAT group if the entire group is dead. + assert(NumAlive >= 0); + if (NumAlive > 0) + continue; - // Removing the node for callee from the call graph and delete it. - FunctionsToRemove.push_back(CGN); + RemoveCGN(CGN); + } } + if (FunctionsToRemove.empty()) return false; diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 20414aa..41334ca 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -242,7 +242,7 @@ void BlockExtractorPass::SplitLandingPadPreds(Function *F) { if (!Split) continue; SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", nullptr, NewBBs); + SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs); } } diff --git a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp new file mode 100644 index 0000000..bffeebb --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp @@ -0,0 +1,732 @@ +//===-- LowerBitSets.cpp - Bitset lowering pass ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers bitset metadata and calls to the llvm.bitset.test intrinsic. +// See http://llvm.org/docs/LangRef.html#bitsets for more information. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/LowerBitSets.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "lowerbitsets" + +STATISTIC(ByteArraySizeBits, "Byte array size in bits"); +STATISTIC(ByteArraySizeBytes, "Byte array size in bytes"); +STATISTIC(NumByteArraysCreated, "Number of byte arrays created"); +STATISTIC(NumBitSetCallsLowered, "Number of bitset calls lowered"); +STATISTIC(NumBitSetDisjointSets, "Number of disjoint sets of bitsets"); + +static cl::opt<bool> AvoidReuse( + "lowerbitsets-avoid-reuse", + cl::desc("Try to avoid reuse of byte array addresses using aliases"), + cl::Hidden, cl::init(true)); + +bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { + if (Offset < ByteOffset) + return false; + + if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0) + return false; + + uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2; + if (BitOffset >= BitSize) + return false; + + return Bits.count(BitOffset); +} + +bool BitSetInfo::containsValue( + const DataLayout &DL, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V, + uint64_t COffset) const { + if (auto GV = dyn_cast<GlobalVariable>(V)) { + auto I = GlobalLayout.find(GV); + if (I == GlobalLayout.end()) + return false; + return containsGlobalOffset(I->second + COffset); + } + + if (auto GEP = dyn_cast<GEPOperator>(V)) { + APInt APOffset(DL.getPointerSizeInBits(0), 0); + bool Result = GEP->accumulateConstantOffset(DL, APOffset); + if (!Result) + return false; + COffset += APOffset.getZExtValue(); + return containsValue(DL, GlobalLayout, GEP->getPointerOperand(), + COffset); + } + + if (auto Op = dyn_cast<Operator>(V)) { + if (Op->getOpcode() == Instruction::BitCast) + return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset); + + if (Op->getOpcode() == Instruction::Select) + return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) && + containsValue(DL, GlobalLayout, Op->getOperand(2), COffset); + } + + return false; +} + +BitSetInfo BitSetBuilder::build() { + if (Min > Max) + Min = 0; + + // Normalize each offset against the minimum observed offset, and compute + // the bitwise OR of each of the offsets. The number of trailing zeros + // in the mask gives us the log2 of the alignment of all offsets, which + // allows us to compress the bitset by only storing one bit per aligned + // address. + uint64_t Mask = 0; + for (uint64_t &Offset : Offsets) { + Offset -= Min; + Mask |= Offset; + } + + BitSetInfo BSI; + BSI.ByteOffset = Min; + + BSI.AlignLog2 = 0; + if (Mask != 0) + BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined); + + // Build the compressed bitset while normalizing the offsets against the + // computed alignment. + BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1; + for (uint64_t Offset : Offsets) { + Offset >>= BSI.AlignLog2; + BSI.Bits.insert(Offset); + } + + return BSI; +} + +void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) { + // Create a new fragment to hold the layout for F. 
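To make the two routines above concrete, here is a freestanding C++20 sketch (illustrative types, not the pass's API) that derives the common alignment the way BitSetBuilder::build() does, by ORing the min-relative offsets and counting trailing zeros, and then answers the same membership query as BitSetInfo::containsGlobalOffset():

#include <algorithm>
#include <bit>      // std::countr_zero (C++20)
#include <cstdint>
#include <set>
#include <vector>

struct MiniBitSet {
  uint64_t ByteOffset = 0, BitSize = 0;
  unsigned AlignLog2 = 0;
  std::set<uint64_t> Bits;
};

// Mirror of build(): normalize offsets to the minimum, OR them together, and
// read the shared alignment off the mask's trailing zeros.
static MiniBitSet buildMini(std::vector<uint64_t> Offsets) {
  MiniBitSet BS;
  if (Offsets.empty()) return BS;
  BS.ByteOffset = *std::min_element(Offsets.begin(), Offsets.end());
  uint64_t Mask = 0;
  for (uint64_t &O : Offsets) { O -= BS.ByteOffset; Mask |= O; }
  BS.AlignLog2 = Mask ? unsigned(std::countr_zero(Mask)) : 0;
  uint64_t MaxO = 0;
  for (uint64_t O : Offsets) MaxO = std::max(MaxO, O);
  BS.BitSize = (MaxO >> BS.AlignLog2) + 1;
  for (uint64_t O : Offsets) BS.Bits.insert(O >> BS.AlignLog2);
  return BS;
}

// Mirror of containsGlobalOffset(): in range, aligned, and bit present.
static bool contains(const MiniBitSet &BS, uint64_t Offset) {
  if (Offset < BS.ByteOffset) return false;
  uint64_t Delta = Offset - BS.ByteOffset;
  if (Delta & ((uint64_t(1) << BS.AlignLog2) - 1)) return false; // misaligned
  uint64_t BitIdx = Delta >> BS.AlignLog2;
  return BitIdx < BS.BitSize && BS.Bits.count(BitIdx);
}

int main() {
  MiniBitSet BS = buildMini({8, 16, 32});   // AlignLog2 == 3, bits {0, 1, 3}
  return (contains(BS, 16) && !contains(BS, 24)) ? 0 : 1;
}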
+ Fragments.emplace_back(); + std::vector<uint64_t> &Fragment = Fragments.back(); + uint64_t FragmentIndex = Fragments.size() - 1; + + for (auto ObjIndex : F) { + uint64_t OldFragmentIndex = FragmentMap[ObjIndex]; + if (OldFragmentIndex == 0) { + // We haven't seen this object index before, so just add it to the current + // fragment. + Fragment.push_back(ObjIndex); + } else { + // This index belongs to an existing fragment. Copy the elements of the + // old fragment into this one and clear the old fragment. We don't update + // the fragment map just yet; this ensures that any further references to + // indices from the old fragment in this fragment do not insert any more + // indices. + std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex]; + Fragment.insert(Fragment.end(), OldFragment.begin(), OldFragment.end()); + OldFragment.clear(); + } + } + + // Update the fragment map to point our object indices to this fragment. + for (uint64_t ObjIndex : Fragment) + FragmentMap[ObjIndex] = FragmentIndex; +} + +void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits, + uint64_t BitSize, uint64_t &AllocByteOffset, + uint8_t &AllocMask) { + // Find the smallest current allocation. + unsigned Bit = 0; + for (unsigned I = 1; I != BitsPerByte; ++I) + if (BitAllocs[I] < BitAllocs[Bit]) + Bit = I; + + AllocByteOffset = BitAllocs[Bit]; + + // Add our size to it. + unsigned ReqSize = AllocByteOffset + BitSize; + BitAllocs[Bit] = ReqSize; + if (Bytes.size() < ReqSize) + Bytes.resize(ReqSize); + + // Set our bits. + AllocMask = 1 << Bit; + for (uint64_t B : Bits) + Bytes[AllocByteOffset + B] |= AllocMask; +} + +namespace { + +struct ByteArrayInfo { + std::set<uint64_t> Bits; + uint64_t BitSize; + GlobalVariable *ByteArray; + Constant *Mask; +}; + +struct LowerBitSets : public ModulePass { + static char ID; + LowerBitSets() : ModulePass(ID) { + initializeLowerBitSetsPass(*PassRegistry::getPassRegistry()); + } + + Module *M; + + bool LinkerSubsectionsViaSymbols; + IntegerType *Int1Ty; + IntegerType *Int8Ty; + IntegerType *Int32Ty; + Type *Int32PtrTy; + IntegerType *Int64Ty; + Type *IntPtrTy; + + // The llvm.bitsets named metadata. + NamedMDNode *BitSetNM; + + // Mapping from bitset mdstrings to the call sites that test them.
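The byte-array allocator above can be mirrored in a few lines of plain C++ (hypothetical struct, fixed at 8 bit planes) to show the idea: each bit position is an independent bump allocator, so up to eight bitsets can share the same bytes, each selected by a one-bit mask.

#include <cstdint>
#include <set>
#include <vector>

struct MiniByteArrayBuilder {
  std::vector<uint8_t> Bytes;
  uint64_t BitAllocs[8] = {};

  void allocate(const std::set<uint64_t> &Bits, uint64_t BitSize,
                uint64_t &ByteOffset, uint8_t &Mask) {
    unsigned Bit = 0;                        // Pick the least-loaded bit plane.
    for (unsigned I = 1; I != 8; ++I)
      if (BitAllocs[I] < BitAllocs[Bit]) Bit = I;
    ByteOffset = BitAllocs[Bit];
    BitAllocs[Bit] = ByteOffset + BitSize;   // Bump that plane's allocator.
    if (Bytes.size() < BitAllocs[Bit]) Bytes.resize(BitAllocs[Bit]);
    Mask = uint8_t(1u << Bit);               // One-bit mask selecting the plane.
    for (uint64_t B : Bits) Bytes[ByteOffset + B] |= Mask;
  }
};

int main() {
  MiniByteArrayBuilder BAB;
  uint64_t Off; uint8_t Mask;
  BAB.allocate({0, 2}, 3, Off, Mask); // bits 0 and 2 of a 3-bit set
  return (Off == 0 && Mask == 1) ? 0 : 1;
}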
+ DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites; + + std::vector<ByteArrayInfo> ByteArrayInfos; + + BitSetInfo + buildBitSet(MDString *BitSet, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + ByteArrayInfo *createByteArray(BitSetInfo &BSI); + void allocateByteArrays(); + Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, + Value *BitOffset); + Value * + lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, + GlobalVariable *CombinedGlobal, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + void buildBitSetsFromGlobals(const std::vector<MDString *> &BitSets, + const std::vector<GlobalVariable *> &Globals); + bool buildBitSets(); + bool eraseBitSetMetadata(); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; +}; + +} // namespace + +INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets", "Lower bitset metadata", false, false) INITIALIZE_PASS_END(LowerBitSets, "lowerbitsets", "Lower bitset metadata", false, false) char LowerBitSets::ID = 0; + +ModulePass *llvm::createLowerBitSetsPass() { return new LowerBitSets; } + +bool LowerBitSets::doInitialization(Module &Mod) { + M = &Mod; + const DataLayout &DL = Mod.getDataLayout(); + + Triple TargetTriple(M->getTargetTriple()); + LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); + + Int1Ty = Type::getInt1Ty(M->getContext()); + Int8Ty = Type::getInt8Ty(M->getContext()); + Int32Ty = Type::getInt32Ty(M->getContext()); + Int32PtrTy = PointerType::getUnqual(Int32Ty); + Int64Ty = Type::getInt64Ty(M->getContext()); + IntPtrTy = DL.getIntPtrType(M->getContext(), 0); + + BitSetNM = M->getNamedMetadata("llvm.bitsets"); + + BitSetTestCallSites.clear(); + + return false; +} + +/// Build a bit set for BitSet using the object layouts in +/// GlobalLayout. +BitSetInfo LowerBitSets::buildBitSet( + MDString *BitSet, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + BitSetBuilder BSB; + + // Compute the byte offset of each element of this bitset. + if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) + continue; + auto OpGlobal = cast<GlobalVariable>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + uint64_t Offset = + cast<ConstantInt>(cast<ConstantAsMetadata>(Op->getOperand(2)) + ->getValue())->getZExtValue(); + + Offset += GlobalLayout.find(OpGlobal)->second; + + BSB.addOffset(Offset); + } + } + + return BSB.build(); +} + +/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in +/// Bits. This pattern maps to the bt instruction on x86. +static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits, + Value *BitOffset) { + auto BitsType = cast<IntegerType>(Bits->getType()); + unsigned BitWidth = BitsType->getBitWidth(); + + BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType); + Value *BitIndex = + B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1)); + Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex); + Value *MaskedBits = B.CreateAnd(Bits, BitMask); + return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0)); +} + +ByteArrayInfo *LowerBitSets::createByteArray(BitSetInfo &BSI) { + // Create globals to stand in for byte arrays and masks. These never actually + // get initialized; we RAUW and erase them later in allocateByteArrays() once + // we know the offset and mask to use.
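A scalar model of the masked bit test built above, under the assumption of a 64-bit constant (an illustrative helper, not the IR-building code): index modulo the bit width, then test that bit, which instruction selection can fold into a single bt on x86.

#include <cstdint>

static bool maskedBitTest(uint64_t Bits, uint64_t BitOffset) {
  uint64_t BitIndex = BitOffset & 63; // BitOffset mod 64
  return (Bits >> BitIndex) & 1;
}

int main() {
  return maskedBitTest(0b1010, 65) ? 0 : 1; // 65 mod 64 == bit 1, which is set
}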
+ auto ByteArrayGlobal = new GlobalVariable( + *M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr); + auto MaskGlobal = new GlobalVariable( + *M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr); + + ByteArrayInfos.emplace_back(); + ByteArrayInfo *BAI = &ByteArrayInfos.back(); + + BAI->Bits = BSI.Bits; + BAI->BitSize = BSI.BitSize; + BAI->ByteArray = ByteArrayGlobal; + BAI->Mask = ConstantExpr::getPtrToInt(MaskGlobal, Int8Ty); + return BAI; +} + +void LowerBitSets::allocateByteArrays() { + std::stable_sort(ByteArrayInfos.begin(), ByteArrayInfos.end(), + [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) { + return BAI1.BitSize > BAI2.BitSize; + }); + + std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size()); + + ByteArrayBuilder BAB; + for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) { + ByteArrayInfo *BAI = &ByteArrayInfos[I]; + + uint8_t Mask; + BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask); + + BAI->Mask->replaceAllUsesWith(ConstantInt::get(Int8Ty, Mask)); + cast<GlobalVariable>(BAI->Mask->getOperand(0))->eraseFromParent(); + } + + Constant *ByteArrayConst = ConstantDataArray::get(M->getContext(), BAB.Bytes); + auto ByteArray = + new GlobalVariable(*M, ByteArrayConst->getType(), /*isConstant=*/true, + GlobalValue::PrivateLinkage, ByteArrayConst); + + for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) { + ByteArrayInfo *BAI = &ByteArrayInfos[I]; + + Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])}; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr( + ByteArrayConst->getType(), ByteArray, Idxs); + + // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures + // that the pc-relative displacement is folded into the lea instead of the + // test instruction getting another displacement. + if (LinkerSubsectionsViaSymbols) { + BAI->ByteArray->replaceAllUsesWith(GEP); + } else { + GlobalAlias *Alias = + GlobalAlias::create(PointerType::getUnqual(Int8Ty), + GlobalValue::PrivateLinkage, "bits", GEP, M); + BAI->ByteArray->replaceAllUsesWith(Alias); + } + BAI->ByteArray->eraseFromParent(); + } + + ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] + + BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] + + BAB.BitAllocs[6] + BAB.BitAllocs[7]; + ByteArraySizeBytes = BAB.Bytes.size(); +} + +/// Build a test that bit BitOffset is set in BSI, where +/// BitSetGlobal is a global containing the bits in BSI. +Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, + ByteArrayInfo *&BAI, Value *BitOffset) { + if (BSI.BitSize <= 64) { + // If the bit set is sufficiently small, we can avoid a load by bit testing + // a constant. + IntegerType *BitsTy; + if (BSI.BitSize <= 32) + BitsTy = Int32Ty; + else + BitsTy = Int64Ty; + + uint64_t Bits = 0; + for (auto Bit : BSI.Bits) + Bits |= uint64_t(1) << Bit; + Constant *BitsConst = ConstantInt::get(BitsTy, Bits); + return createMaskedBitTest(B, BitsConst, BitOffset); + } else { + if (!BAI) { + ++NumByteArraysCreated; + BAI = createByteArray(BSI); + } + + Constant *ByteArray = BAI->ByteArray; + Type *Ty = BAI->ByteArray->getValueType(); + if (!LinkerSubsectionsViaSymbols && AvoidReuse) { + // Each use of the byte array uses a different alias. This makes the + // backend less likely to reuse previously computed byte array addresses, + // improving the security of the CFI mechanism based on this pass. 
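The small-set fast path above packs up to 64 members into one integer constant; a minimal standalone sketch of that packing step (hypothetical helper, assumes every index is below 64):

#include <cassert>
#include <cstdint>
#include <set>

// With at most 64 bit positions the whole set fits in one integer, so the
// membership test needs no memory load at all.
static uint64_t packSmallBitSet(const std::set<uint64_t> &BitIndices) {
  uint64_t Bits = 0;
  for (uint64_t Bit : BitIndices)
    Bits |= uint64_t(1) << Bit;
  return Bits;
}

int main() {
  assert(packSmallBitSet({0, 1, 3}) == 0b1011);
}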
+ ByteArray = GlobalAlias::create(BAI->ByteArray->getType(), + GlobalValue::PrivateLinkage, "bits_use", + ByteArray, M); + } + + Value *ByteAddr = B.CreateGEP(Ty, ByteArray, BitOffset); + Value *Byte = B.CreateLoad(ByteAddr); + + Value *ByteAndMask = B.CreateAnd(Byte, BAI->Mask); + return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0)); + } +} + +/// Lower a llvm.bitset.test call to its implementation. Returns the value to +/// replace the call with. +Value *LowerBitSets::lowerBitSetCall( + CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, + GlobalVariable *CombinedGlobal, + const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Value *Ptr = CI->getArgOperand(0); + const DataLayout &DL = M->getDataLayout(); + + if (BSI.containsValue(DL, GlobalLayout, Ptr)) + return ConstantInt::getTrue(CombinedGlobal->getParent()->getContext()); + + Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy); + Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( + GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); + + BasicBlock *InitialBB = CI->getParent(); + + IRBuilder<> B(CI); + + Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy); + + if (BSI.isSingleOffset()) + return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt); + + Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt); + + Value *BitOffset; + if (BSI.AlignLog2 == 0) { + BitOffset = PtrOffset; + } else { + // We need to check that the offset both falls within our range and is + // suitably aligned. We can check both properties at the same time by + // performing a right rotate by log2(alignment) followed by an integer + // comparison against the bitset size. The rotate will move the lower + // order bits that need to be zero into the higher order bits of the + // result, causing the comparison to fail if they are nonzero. The rotate + // also conveniently gives us a bit offset to use during the load from + // the bitset. + Value *OffsetSHR = + B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2)); + Value *OffsetSHL = B.CreateShl( + PtrOffset, + ConstantInt::get(IntPtrTy, DL.getPointerSizeInBits(0) - BSI.AlignLog2)); + BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); + } + + Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize); + Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); + + // If the bit set is all ones, testing against it is unnecessary. + if (BSI.isAllOnes()) + return OffsetInRange; + + TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false); + IRBuilder<> ThenB(Term); + + // Now that we know that the offset is in range and aligned, load the + // appropriate bit from the bitset. + Value *Bit = createBitSetTest(ThenB, BSI, BAI, BitOffset); + + // The value we want is 0 if we came directly from the initial block + // (having failed the range or alignment checks), or the loaded bit if + // we came from the block in which we loaded it. + B.SetInsertPoint(CI); + PHINode *P = B.CreatePHI(Int1Ty, 2); + P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB); + P->addIncoming(Bit, ThenB.GetInsertBlock()); + return P; +} + +/// Given a disjoint set of bitsets and globals, layout the globals, build the +/// bit sets and lower the llvm.bitset.test calls. +void LowerBitSets::buildBitSetsFromGlobals( + const std::vector<MDString *> &BitSets, + const std::vector<GlobalVariable *> &Globals) { + // Build a new global with the combined contents of the referenced globals. 
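The rotate trick described above can be checked in isolation. A standalone model assuming 64-bit pointers (illustrative names, not the pass's IR): rotating the byte delta right by AlignLog2 moves any misaligned low bits into the high bits, so one unsigned compare against BitSize rejects both out-of-range and misaligned pointers, and the surviving value doubles as the bit offset.

#include <cassert>
#include <cstdint>

static bool offsetInRange(uint64_t PtrDelta, unsigned AlignLog2,
                          uint64_t BitSize, uint64_t &BitOffset) {
  // Guard AlignLog2 == 0 separately; a shift by 64 would be undefined.
  BitOffset = AlignLog2 == 0
                  ? PtrDelta
                  : (PtrDelta >> AlignLog2) | (PtrDelta << (64 - AlignLog2));
  return BitOffset < BitSize;
}

int main() {
  uint64_t Bit;
  assert(offsetInRange(16, 3, 4, Bit) && Bit == 2); // aligned and in range
  assert(!offsetInRange(17, 3, 4, Bit));            // misaligned: rotate blows up
}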
+ std::vector<Constant *> GlobalInits; + const DataLayout &DL = M->getDataLayout(); + for (GlobalVariable *G : Globals) { + GlobalInits.push_back(G->getInitializer()); + uint64_t InitSize = DL.getTypeAllocSize(G->getInitializer()->getType()); + + // Compute the amount of padding required to align the next element to the + // next power of 2. + uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; + + // A cap of 128 was found experimentally to give a good data/instruction + // overhead tradeoff. + if (Padding > 128) + Padding = RoundUpToAlignment(InitSize, 128) - InitSize; + + GlobalInits.push_back( + ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + } + if (!GlobalInits.empty()) + GlobalInits.pop_back(); + Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits); + auto CombinedGlobal = + new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true, + GlobalValue::PrivateLinkage, NewInit); + + const StructLayout *CombinedGlobalLayout = + DL.getStructLayout(cast<StructType>(NewInit->getType())); + + // Compute the offsets of the original globals within the new global. + DenseMap<GlobalVariable *, uint64_t> GlobalLayout; + for (unsigned I = 0; I != Globals.size(); ++I) + // Multiply by 2 to account for padding elements. + GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); + + // For each bitset in this disjoint set... + for (MDString *BS : BitSets) { + // Build the bitset. + BitSetInfo BSI = buildBitSet(BS, GlobalLayout); + + ByteArrayInfo *BAI = 0; + + // Lower each call to llvm.bitset.test for this bitset. + for (CallInst *CI : BitSetTestCallSites[BS]) { + ++NumBitSetCallsLowered; + Value *Lowered = lowerBitSetCall(CI, BSI, BAI, CombinedGlobal, GlobalLayout); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); + } + } + + // Build aliases pointing to offsets into the combined global for each + // global from which we built the combined global, and replace references + // to the original globals with references to the aliases. + for (unsigned I = 0; I != Globals.size(); ++I) { + // Multiply by 2 to account for padding elements. + Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, I * 2)}; + Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr( + NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs); + if (LinkerSubsectionsViaSymbols) { + Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr); + } else { + GlobalAlias *GAlias = + GlobalAlias::create(Globals[I]->getType(), Globals[I]->getLinkage(), + "", CombinedGlobalElemPtr, M); + GAlias->takeName(Globals[I]); + Globals[I]->replaceAllUsesWith(GAlias); + } + Globals[I]->eraseFromParent(); + } +} + +/// Lower all bit sets in this module. +bool LowerBitSets::buildBitSets() { + Function *BitSetTestFunc = + M->getFunction(Intrinsic::getName(Intrinsic::bitset_test)); + if (!BitSetTestFunc) + return false; + + // Equivalence class set containing bitsets and the globals they reference. + // This is used to partition the set of bitsets in the module into disjoint + // sets.
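The padding computation above can be restated as a small self-contained function (assumes InitSize >= 1; NextPowerOf2 and RoundUpToAlignment are replaced by explicit arithmetic): pad each member to the next power of two, but once that would cost more than 128 bytes, just round up to a 128-byte boundary instead.

#include <cassert>
#include <cstdint>

static uint64_t paddingFor(uint64_t InitSize) {
  uint64_t NextPow2 = 1;
  while (NextPow2 < InitSize)
    NextPow2 <<= 1;                       // smallest power of two >= InitSize
  uint64_t Padding = NextPow2 - InitSize;
  if (Padding > 128)
    Padding = (InitSize + 127) / 128 * 128 - InitSize; // round up to 128
  return Padding;
}

int main() {
  assert(paddingFor(24) == 8);   // pad 24 up to 32
  assert(paddingFor(300) == 84); // 512 is too far; round up to 384 instead
}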
+ typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>> + GlobalClassesTy; + GlobalClassesTy GlobalClasses; + + for (const Use &U : BitSetTestFunc->uses()) { + auto CI = cast<CallInst>(U.getUser()); + + auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); + if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata())) + report_fatal_error( + "Second argument of llvm.bitset.test must be metadata string"); + auto BitSet = cast<MDString>(BitSetMDVal->getMetadata()); + + // Add the call site to the list of call sites for this bit set. We also use + // BitSetTestCallSites to keep track of whether we have seen this bit set + // before. If we have, we don't need to re-add the referenced globals to the + // equivalence class. + std::pair<DenseMap<MDString *, std::vector<CallInst *>>::iterator, + bool> Ins = + BitSetTestCallSites.insert( + std::make_pair(BitSet, std::vector<CallInst *>())); + Ins.first->second.push_back(CI); + if (!Ins.second) + continue; + + // Add the bitset to the equivalence class. + GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet); + GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); + + if (!BitSetNM) + continue; + + // Verify the bitset metadata and add the referenced globals to the bitset's + // equivalence class. + for (MDNode *Op : BitSetNM->operands()) { + if (Op->getNumOperands() != 3) + report_fatal_error( + "All operands of llvm.bitsets metadata must have 3 elements"); + + if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) + continue; + + auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); + if (!OpConstMD) + report_fatal_error("Bit set element must be a constant"); + auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue()); + if (!OpGlobal) + report_fatal_error("Bit set element must refer to global"); + + auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); + if (!OffsetConstMD) + report_fatal_error("Bit set element offset must be a constant"); + auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); + if (!OffsetInt) + report_fatal_error( + "Bit set element offset must be an integer constant"); + + CurSet = GlobalClasses.unionSets( + CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal))); + } + } + + if (GlobalClasses.empty()) + return false; + + // For each disjoint set we found... + for (GlobalClassesTy::iterator I = GlobalClasses.begin(), + E = GlobalClasses.end(); + I != E; ++I) { + if (!I->isLeader()) continue; + + ++NumBitSetDisjointSets; + + // Build the list of bitsets and referenced globals in this disjoint set. + std::vector<MDString *> BitSets; + std::vector<GlobalVariable *> Globals; + llvm::DenseMap<MDString *, uint64_t> BitSetIndices; + llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices; + for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I); + MI != GlobalClasses.member_end(); ++MI) { + if ((*MI).is<MDString *>()) { + BitSetIndices[MI->get<MDString *>()] = BitSets.size(); + BitSets.push_back(MI->get<MDString *>()); + } else { + GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size(); + Globals.push_back(MI->get<GlobalVariable *>()); + } + } + + // For each bitset, build a set of indices that refer to globals referenced + // by the bitset. 
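The EquivalenceClasses-based partitioning above behaves like a classic union-find; here is a toy standalone version (not LLVM's data structure) showing how each llvm.bitsets entry unions a bitset with the globals it references, leaving one disjoint class per independently lowerable group.

#include <cstdio>
#include <numeric>
#include <vector>

struct UnionFind {
  std::vector<int> Parent;
  explicit UnionFind(int N) : Parent(N) {
    std::iota(Parent.begin(), Parent.end(), 0);
  }
  // Find with path compression.
  int find(int X) { return Parent[X] == X ? X : Parent[X] = find(Parent[X]); }
  void unite(int A, int B) { Parent[find(A)] = find(B); }
};

int main() {
  UnionFind UF(4);  // ids: 0,1 are bitsets; 2,3 are globals
  UF.unite(0, 2);   // bitset 0 references global 2
  UF.unite(1, 3);   // bitset 1 references global 3
  std::printf("%d\n", UF.find(0) == UF.find(1)); // 0: two disjoint sets
}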
+ std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); + if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + // Op = { bitset name, global, offset } + if (!Op->getOperand(1)) + continue; + auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0))); + if (I == BitSetIndices.end()) + continue; + + auto OpGlobal = cast<GlobalVariable>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); + } + } + + // Order the sets of indices by size. The GlobalLayoutBuilder works best + // when given small index sets first. + std::stable_sort( + BitSetMembers.begin(), BitSetMembers.end(), + [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { + return O1.size() < O2.size(); + }); + + // Create a GlobalLayoutBuilder and provide it with index sets as layout + // fragments. The GlobalLayoutBuilder tries to lay out members of fragments + // as close together as possible. + GlobalLayoutBuilder GLB(Globals.size()); + for (auto &&MemSet : BitSetMembers) + GLB.addFragment(MemSet); + + // Build a vector of globals with the computed layout. + std::vector<GlobalVariable *> OrderedGlobals(Globals.size()); + auto OGI = OrderedGlobals.begin(); + for (auto &&F : GLB.Fragments) + for (auto &&Offset : F) + *OGI++ = Globals[Offset]; + + // Order bitsets by name for determinism. + std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) { + return S1->getString() < S2->getString(); + }); + + // Build the bitsets from this disjoint set. + buildBitSetsFromGlobals(BitSets, OrderedGlobals); + } + + allocateByteArrays(); + + return true; +} + +bool LowerBitSets::eraseBitSetMetadata() { + if (!BitSetNM) + return false; + + M->eraseNamedMetadata(BitSetNM); + return true; +} + +bool LowerBitSets::runOnModule(Module &M) { + bool Changed = buildBitSets(); + Changed |= eraseBitSetMetadata(); + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index b91ebf2..91a5eef 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -127,9 +127,8 @@ namespace { /// side of claiming that two functions are different). class FunctionComparator { public: - FunctionComparator(const DataLayout *DL, const Function *F1, - const Function *F2) - : FnL(F1), FnR(F2), DL(DL) {} + FunctionComparator(const Function *F1, const Function *F2) + : FnL(F1), FnR(F2) {} /// Test whether the two functions have equivalent behaviour. int compare(); @@ -292,8 +291,7 @@ private: /// Parts to be compared for each comparison stage, /// most significant stage first: /// 1. Address space. As numbers. - /// 2. Constant offset, (if "DataLayout *DL" field is not NULL, - /// using GEPOperator::accumulateConstantOffset method). + /// 2. Constant offset, (using GEPOperator::accumulateConstantOffset method). /// 3. Pointer operand type (using cmpType method). /// 4. Number of operands. /// 5. Compare operands, using cmpValues method. @@ -354,8 +352,6 @@ private: // The two functions undergoing comparison. const Function *FnL, *FnR; - const DataLayout *DL; - /// Assign serial numbers to values from left function, and values from /// right function. 
/// Explanation: @@ -394,14 +390,13 @@ private: class FunctionNode { AssertingVH<Function> F; - const DataLayout *DL; public: - FunctionNode(Function *F, const DataLayout *DL) : F(F), DL(DL) {} + FunctionNode(Function *F) : F(F) {} Function *getFunc() const { return F; } void release() { F = 0; } bool operator<(const FunctionNode &RHS) const { - return (FunctionComparator(DL, F, RHS.getFunc()).compare()) == -1; + return (FunctionComparator(F, RHS.getFunc()).compare()) == -1; } }; } @@ -620,10 +615,11 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { PointerType *PTyL = dyn_cast<PointerType>(TyL); PointerType *PTyR = dyn_cast<PointerType>(TyR); - if (DL) { - if (PTyL && PTyL->getAddressSpace() == 0) TyL = DL->getIntPtrType(TyL); - if (PTyR && PTyR->getAddressSpace() == 0) TyR = DL->getIntPtrType(TyR); - } + const DataLayout &DL = FnL->getParent()->getDataLayout(); + if (PTyL && PTyL->getAddressSpace() == 0) + TyL = DL.getIntPtrType(TyL); + if (PTyR && PTyR->getAddressSpace() == 0) + TyR = DL.getIntPtrType(TyR); if (TyL == TyR) return 0; @@ -723,6 +719,15 @@ int FunctionComparator::cmpOperations(const Instruction *L, R->getRawSubclassOptionalData())) return Res; + if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) { + if (int Res = cmpTypes(AI->getAllocatedType(), + cast<AllocaInst>(R)->getAllocatedType())) + return Res; + if (int Res = + cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment())) + return Res; + } + // We have two instructions of identical opcode and #operands. Check to see // if all operands are the same type for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) { @@ -855,13 +860,12 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL, // When we have target data, we can reduce the GEP down to the value in bytes // added to the address. - if (DL) { - unsigned BitWidth = DL->getPointerSizeInBits(ASL); - APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0); - if (GEPL->accumulateConstantOffset(*DL, OffsetL) && - GEPR->accumulateConstantOffset(*DL, OffsetR)) - return cmpAPInts(OffsetL, OffsetR); - } + const DataLayout &DL = FnL->getParent()->getDataLayout(); + unsigned BitWidth = DL.getPointerSizeInBits(ASL); + APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0); + if (GEPL->accumulateConstantOffset(DL, OffsetL) && + GEPR->accumulateConstantOffset(DL, OffsetR)) + return cmpAPInts(OffsetL, OffsetR); if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), (uint64_t)GEPR->getPointerOperand()->getType())) @@ -1122,9 +1126,6 @@ private: /// to modify it. FnTreeType FnTree; - /// DataLayout for more accurate GEP comparisons. May be NULL. - const DataLayout *DL; - /// Whether or not the target supports global aliases. bool HasGlobalAliases; }; @@ -1152,8 +1153,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) { Function *F1 = cast<Function>(*I); Function *F2 = cast<Function>(*J); - int Res1 = FunctionComparator(DL, F1, F2).compare(); - int Res2 = FunctionComparator(DL, F2, F1).compare(); + int Res1 = FunctionComparator(F1, F2).compare(); + int Res2 = FunctionComparator(F2, F1).compare(); // If F1 <= F2, then F2 >= F1, otherwise report failure. 
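The sanity check that follows asserts, among other things, that the comparator is antisymmetric; a toy standalone probe of that property on a plain integer comparator (purely illustrative, unrelated to FunctionComparator itself):

#include <cassert>

// A three-way comparator that induces a total order must satisfy
// cmp(a, b) == -cmp(b, a) for every pair.
static int cmpInts(int A, int B) { return A < B ? -1 : (A > B ? 1 : 0); }

int main() {
  int Vals[] = {3, 1, 4, 1, 5};
  for (int A : Vals)
    for (int B : Vals)
      assert(cmpInts(A, B) == -cmpInts(B, A));
}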
if (Res1 != -Res2) { @@ -1174,8 +1175,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { continue; Function *F3 = cast<Function>(*K); - int Res3 = FunctionComparator(DL, F1, F3).compare(); - int Res4 = FunctionComparator(DL, F2, F3).compare(); + int Res3 = FunctionComparator(F1, F3).compare(); + int Res4 = FunctionComparator(F2, F3).compare(); bool Transitive = true; @@ -1212,8 +1213,6 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) @@ -1368,8 +1367,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { // Replace G with an alias to F and delete G. void MergeFunctions::writeAlias(Function *F, Function *G) { PointerType *PTy = G->getType(); - auto *GA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), - G->getLinkage(), "", F); + auto *GA = GlobalAlias::create(PTy, G->getLinkage(), "", F); F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); GA->takeName(G); GA->setVisibility(G->getVisibility()); @@ -1420,7 +1418,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { // that was already inserted. bool MergeFunctions::insert(Function *NewFunction) { std::pair<FnTreeType::iterator, bool> Result = - FnTree.insert(FunctionNode(NewFunction, DL)); + FnTree.insert(FunctionNode(NewFunction)); if (Result.second) { DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n'); @@ -1457,7 +1455,7 @@ bool MergeFunctions::insert(Function *NewFunction) { void MergeFunctions::remove(Function *F) { // We need to make sure we remove F, not a function "equal" to F per the // function equality comparator. 
- FnTreeType::iterator found = FnTree.find(FunctionNode(F, DL)); + FnTreeType::iterator found = FnTree.find(FunctionNode(F)); size_t Erased = 0; if (found != FnTree.end() && found->getFunc() == F) { Erased = 1; diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp index 76d6dfa..4a7cb7b 100644 --- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -58,13 +58,13 @@ Function* PartialInliner::unswitchFunction(Function* F) { BasicBlock* returnBlock = nullptr; BasicBlock* nonReturnBlock = nullptr; unsigned returnCount = 0; - for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock); - SI != SE; ++SI) - if (isa<ReturnInst>((*SI)->getTerminator())) { - returnBlock = *SI; + for (BasicBlock *BB : successors(entryBlock)) { + if (isa<ReturnInst>(BB->getTerminator())) { + returnBlock = BB; returnCount++; } else - nonReturnBlock = *SI; + nonReturnBlock = BB; + } if (returnCount != 1) return nullptr; diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index fb673dc..7eb0682 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -19,12 +19,11 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" @@ -60,6 +59,10 @@ static cl::opt<bool> RunLoopRerolling("reroll-loops", cl::Hidden, cl::desc("Run the loop rerolling pass")); +static cl::opt<bool> +RunFloat2Int("float-to-int", cl::Hidden, cl::init(true), + cl::desc("Run the float2int (float demotion) pass")); + static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false), cl::Hidden, cl::desc("Run the load combining pass")); @@ -78,6 +81,14 @@ static cl::opt<bool> EnableMLSM("mlsm", cl::init(true), cl::Hidden, cl::desc("Enable motion of merged load and store")); +static cl::opt<bool> EnableLoopInterchange( + "enable-loopinterchange", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopInterchange Pass")); + +static cl::opt<bool> EnableLoopDistribute( + "enable-loop-distribute", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopDistribution Pass")); + static cl::opt<bool> EnableGVN("enable-gvn", cl::init(true), cl::Hidden, cl::desc("Run the global value numbering pass")); @@ -98,7 +109,6 @@ PassManagerBuilder::PassManagerBuilder() { DisableGVNLoadPRE = false; VerifyInput = false; VerifyOutput = false; - StripDebug = false; MergeFunctions = false; } @@ -122,7 +132,7 @@ void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) { } void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, - PassManagerBase &PM) const { + legacy::PassManagerBase &PM) const { for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i) if ((*GlobalExtensions)[i].first == ETy) (*GlobalExtensions)[i].second(*this, PM); @@ -131,8 +141,8 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, Extensions[i].second(*this, PM); } -void 
-PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const { +void PassManagerBuilder::addInitialAliasAnalysisPasses( + legacy::PassManagerBase &PM) const { // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. @@ -143,11 +153,13 @@ PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const { PM.add(createBasicAliasAnalysisPass()); } -void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { +void PassManagerBuilder::populateFunctionPassManager( + legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); // Add LibraryInfo if we have some. - if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo)); + if (LibraryInfo) + FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); if (OptLevel == 0) return; @@ -162,7 +174,8 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { FPM.add(createLowerExpectIntrinsicPass()); } -void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { +void PassManagerBuilder::populateModulePassManager( + legacy::PassManagerBase &MPM) { // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. if (OptLevel == 0) { @@ -186,7 +199,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { } // Add LibraryInfo if we have some. - if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo)); + if (LibraryInfo) + MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); addInitialAliasAnalysisPasses(MPM); @@ -240,7 +254,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops - + if (EnableLoopInterchange) { + MPM.add(createLoopInterchangePass()); // Interchange loops + MPM.add(createCFGSimplificationPass()); + } if (!DisableUnrollLoops) MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); @@ -254,6 +271,11 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP + // Delete dead bit computations (instcombine runs after to fold away the dead + // computations, and then ADCE will run later to exploit any new DCE + // opportunities that creates). + MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations + // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. MPM.add(createInstructionCombiningPass()); @@ -261,6 +283,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores + MPM.add(createLICMPass()); addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -298,11 +321,18 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // we must insert a no-op module pass to reset the pass manager. MPM.add(createBarrierNoopPass()); + if (RunFloat2Int) + MPM.add(createFloat2IntPass()); + // Re-rotate loops in all our loop nests. 
These may have fallen out of // rotated form due to GVN or other transformations, and the vectorizer relies // on the rotated form. - if (ExtraVectorizerPasses) - MPM.add(createLoopRotatePass()); + MPM.add(createLoopRotatePass()); + + // Distribute loops to allow partial vectorization. I.e. isolate dependences + // into a separate loop that would otherwise inhibit vectorization. + if (EnableLoopDistribute) + MPM.add(createLoopDistributePass()); MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); // FIXME: Because of #pragma vectorize enable, the passes below are always @@ -354,9 +384,19 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createCFGSimplificationPass()); MPM.add(createInstructionCombiningPass()); - if (!DisableUnrollLoops) + if (!DisableUnrollLoops) { MPM.add(createLoopUnrollPass()); // Unroll small loops + // LoopUnroll may generate some redundancy to clean up. + MPM.add(createInstructionCombiningPass()); + + // Runtime unrolling will introduce a runtime check in the loop prologue. If the + // unrolled loop is an inner loop, then the prologue will be inside the + // outer loop. The LICM pass can help to promote the runtime check out if the + // checked value is loop invariant. + MPM.add(createLICMPass()); + } + // After vectorization and unrolling, assume intrinsics may tell us more // about pointer alignments. MPM.add(createAlignmentFromAssumptionsPass()); @@ -379,7 +419,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_OptimizerLast, MPM); } -void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { +void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); @@ -450,6 +490,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { // More loops are countable; try to optimize them. PM.add(createIndVarSimplifyPass()); PM.add(createLoopDeletionPass()); + if (EnableLoopInterchange) + PM.add(createLoopInterchangePass()); + PM.add(createLoopVectorizePass(true, LoopVectorize)); // More scalar chains could be vectorized due to more alias information @@ -469,7 +512,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); +} +void PassManagerBuilder::addLateLTOOptimizationPasses( + legacy::PassManagerBase &PM) { // Delete basic blocks, which optimization passes may have killed. PM.add(createCFGSimplificationPass()); @@ -482,32 +528,26 @@ void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) { PM.add(createMergeFunctionsPass()); } -void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, - TargetMachine *TM) { - if (TM) { - PM.add(new DataLayoutPass()); - TM->addAnalysisPasses(PM); - } - +void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (LibraryInfo) - PM.add(new TargetLibraryInfo(*LibraryInfo)); + PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo)); if (VerifyInput) PM.add(createVerifierPass()); - if (StripDebug) - PM.add(createStripSymbolsPass(true)); + if (OptLevel > 1) + addLTOOptimizationPasses(PM); - if (VerifyInput) - PM.add(createDebugInfoVerifierPass()); + // Lower bit sets to globals. This pass supports Clang's control flow + // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI + // is enabled. The pass does nothing if CFI is disabled.
+ PM.add(createLowerBitSetsPass()); if (OptLevel != 0) - addLTOOptimizationPasses(PM); + addLateLTOOptimizationPasses(PM); - if (VerifyOutput) { + if (VerifyOutput) PM.add(createVerifierPass()); - PM.add(createDebugInfoVerifierPass()); - } } inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) { @@ -573,7 +613,7 @@ void LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB, LLVMPassManagerRef PM) { PassManagerBuilder *Builder = unwrap(PMB); - FunctionPassManager *FPM = unwrap<FunctionPassManager>(PM); + legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM); Builder->populateFunctionPassManager(*FPM); } @@ -581,7 +621,7 @@ void LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB, LLVMPassManagerRef PM) { PassManagerBuilder *Builder = unwrap(PMB); - PassManagerBase *MPM = unwrap(PM); + legacy::PassManagerBase *MPM = unwrap(PM); Builder->populateModulePassManager(*MPM); } @@ -590,7 +630,7 @@ void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, LLVMBool Internalize, LLVMBool RunInliner) { PassManagerBuilder *Builder = unwrap(PMB); - PassManagerBase *LPM = unwrap(PM); + legacy::PassManagerBase *LPM = unwrap(PM); // A small backwards compatibility hack. populateLTOPassManager used to take // a RunInliner option. diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index 7bd4ce1..1943b93 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -18,8 +18,10 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -175,7 +177,7 @@ bool PruneEH::SimplifyFunction(Function *F) { bool MadeChange = false; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) - if (II->doesNotThrow()) { + if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(II)) { SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); // Insert a call instruction before the invoke. CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index 816978e..60c9573 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -305,41 +305,31 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { SmallVector<Metadata *, 64> LiveSubprograms; DenseSet<const MDNode *> VisitedSet; - for (DICompileUnit DIC : F.compile_units()) { - assert(DIC.Verify() && "DIC must verify as a DICompileUnit."); - + for (DICompileUnit *DIC : F.compile_units()) { // Create our live subprogram list. - DIArray SPs = DIC.getSubprograms(); bool SubprogramChange = false; - for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { - DISubprogram DISP(SPs.getElement(i)); - assert(DISP.Verify() && "DISP must verify as a DISubprogram."); - + for (DISubprogram *DISP : DIC->getSubprograms()) { // Make sure we visit each subprogram only once. if (!VisitedSet.insert(DISP).second) continue; // If the function referenced by DISP is not null, the function is live.
- if (DISP.getFunction()) + if (DISP->getFunction()) LiveSubprograms.push_back(DISP); else SubprogramChange = true; } // Create our live global variable list. - DIArray GVs = DIC.getGlobalVariables(); bool GlobalVariableChange = false; - for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) { - DIGlobalVariable DIG(GVs.getElement(i)); - assert(DIG.Verify() && "DIG must verify as DIGlobalVariable."); - + for (DIGlobalVariable *DIG : DIC->getGlobalVariables()) { // Make sure we visit each global variable only once. if (!VisitedSet.insert(DIG).second) continue; // If the global variable referenced by DIG is not null, the global // variable is live. - if (DIG.getGlobal()) + if (DIG->getVariable()) LiveGlobalVariables.push_back(DIG); else GlobalVariableChange = true; @@ -349,12 +339,12 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { // subprogram list/global variable list with our new live subprogram/global // variable list. if (SubprogramChange) { - DIC.replaceSubprograms(DIArray(MDNode::get(C, LiveSubprograms))); + DIC->replaceSubprograms(MDTuple::get(C, LiveSubprograms)); Changed = true; } if (GlobalVariableChange) { - DIC.replaceGlobalVariables(DIArray(MDNode::get(C, LiveGlobalVariables))); + DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables)); Changed = true; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 6d20384..a8d0172 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/DataLayout.h" @@ -891,7 +891,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero, /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out. bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, - Instruction *CxtI) { + Instruction &CxtI) { // There are different heuristics we can use for this. Here are some simple // ones. @@ -909,18 +909,18 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, // // Since the carry into the most significant position is always equal to // the carry out of the addition, there is no signed overflow. - if (ComputeNumSignBits(LHS, 0, CxtI) > 1 && - ComputeNumSignBits(RHS, 0, CxtI) > 1) + if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 && + ComputeNumSignBits(RHS, 0, &CxtI) > 1) return true; unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI); // Addition of two 2's complement numbers having opposite signs will never // overflow. @@ -943,21 +943,21 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, /// overflow to change the sign bit or have a carry out. /// TODO: Handle this for Vectors.
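The sign-bit argument above can be made concrete for 32-bit integers. A standalone sketch (plain C++, not the InstCombine code): a value with at least two sign bits lies in [-2^30, 2^30 - 1], so the sum of two such values lies in [-2^31, 2^31 - 2] and can never overflow int32_t.

#include <cstdint>

static bool fitsInInt32AfterAdd(int32_t L, int32_t R) {
  const int32_t Bound = INT32_C(1) << 30;   // 2^30
  bool LTwoSignBits = L >= -Bound && L < Bound;
  bool RTwoSignBits = R >= -Bound && R < Bound;
  if (LTwoSignBits && RTwoSignBits)
    return true;                            // no overflow possible
  int64_t Wide = int64_t(L) + int64_t(R);   // otherwise fall back to a wide add
  return Wide >= INT32_MIN && Wide <= INT32_MAX;
}

int main() {
  return fitsInInt32AfterAdd(1 << 29, -(1 << 29)) ? 0 : 1;
}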
bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS, - Instruction *CxtI) { + Instruction &CxtI) { // If LHS and RHS each have at least two sign bits, the subtraction // cannot overflow. - if (ComputeNumSignBits(LHS, 0, CxtI) > 1 && - ComputeNumSignBits(RHS, 0, CxtI) > 1) + if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 && + ComputeNumSignBits(RHS, 0, &CxtI) > 1) return true; unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI); // Subtraction of two 2's compliment numbers having identical signs will // never overflow. @@ -972,12 +972,14 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS, /// \brief Return true if we can prove that: /// (sub LHS, RHS) === (sub nuw LHS, RHS) bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, - Instruction *CxtI) { + Instruction &CxtI) { // If the LHS is negative and the RHS is non-negative, no unsigned wrap. bool LHSKnownNonNegative, LHSKnownNegative; bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, CxtI); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, CxtI); + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, + &CxtI); + ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, + &CxtI); if (LHSKnownNegative && RHSKnownNonNegative) return true; @@ -1046,15 +1048,15 @@ static Value *checkForNegativeOperand(BinaryOperator &I, } Instruction *InstCombiner::visitAdd(BinaryOperator &I) { - bool Changed = SimplifyAssociativeOrCommutative(I); - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + bool Changed = SimplifyAssociativeOrCommutative(I); + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (Value *V = SimplifyVectorOp(I)) - return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), + I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) + return ReplaceInstUsesWith(I, V); // (A*B)+(A*C) -> A*(B+C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) @@ -1158,20 +1160,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); // A+B --> A|B iff A and B have no bits set in common. - if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - APInt LHSKnownOne(IT->getBitWidth(), 0); - APInt LHSKnownZero(IT->getBitWidth(), 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &I); - if (LHSKnownZero != 0) { - APInt RHSKnownOne(IT->getBitWidth(), 0); - APInt RHSKnownZero(IT->getBitWidth(), 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &I); - - // No bits in common -> bitwise or. 
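WillNotOverflowSignedAdd/Sub above reason that two operands with more than one sign bit each cannot overflow the type. A minimal standalone check of that claim at 8 bits (plain C++, mirroring what ComputeNumSignBits reports, not part of the patch):

#include <cstdint>

// Count the leading bits equal to the sign bit, as ComputeNumSignBits does:
// e.g. 0b00001010 and 0b11110101 both report 4.
static int numSignBits(uint8_t v) {
  int sign = v >> 7, n = 0;
  for (int i = 7; i >= 0 && ((v >> i) & 1) == sign; --i)
    ++n;
  return n;
}

int main() {
  for (int a = -128; a <= 127; ++a)
    for (int b = -128; b <= 127; ++b)
      if (numSignBits((uint8_t)a) > 1 && numSignBits((uint8_t)b) > 1) {
        int sum = a + b, diff = a - b;
        if (sum < -128 || sum > 127 || diff < -128 || diff > 127)
          return 1; // never reached: >1 sign bit on both sides, no overflow
      }
  return 0;
}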
- if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) - return BinaryOperator::CreateOr(LHS, RHS); - } - } + if (haveNoCommonBitsSet(LHS, RHS, DL, AC, &I, DT)) + return BinaryOperator::CreateOr(LHS, RHS); if (Constant *CRHS = dyn_cast<Constant>(RHS)) { Value *X; @@ -1243,7 +1233,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType()); if (LHSConv->hasOneUse() && ConstantExpr::getSExt(CI, I.getType()) == RHSC && - WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, &I)) { + WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) { // Insert the new, smaller add. Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv"); @@ -1256,10 +1246,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // Only do this if x/y have the same type, if at last one of them has a // single use (so we don't increase the number of sexts), and if the // integer add will not overflow. - if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&& + if (LHSConv->getOperand(0)->getType() == + RHSConv->getOperand(0)->getType() && (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && WillNotOverflowSignedAdd(LHSConv->getOperand(0), - RHSConv->getOperand(0), &I)) { + RHSConv->getOperand(0), I)) { // Insert the new integer add. Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv"); @@ -1307,7 +1298,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // TODO(jingyue): Consider WillNotOverflowSignedAdd and // WillNotOverflowUnsignedAdd to reduce the number of invocations of // computeKnownBits. - if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, &I)) { + if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) { Changed = true; I.setHasNoSignedWrap(true); } @@ -1371,7 +1362,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType()); if (LHSConv->hasOneUse() && ConstantExpr::getSIToFP(CI, I.getType()) == CFP && - WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, &I)) { + WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) { // Insert the new integer add. Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv"); @@ -1384,10 +1375,11 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { // Only do this if x/y have the same type, if at last one of them has a // single use (so we don't increase the number of int->fp conversions), // and if the integer add will not overflow. - if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&& + if (LHSConv->getOperand(0)->getType() == + RHSConv->getOperand(0)->getType() && (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && WillNotOverflowSignedAdd(LHSConv->getOperand(0), - RHSConv->getOperand(0), &I)) { + RHSConv->getOperand(0), I)) { // Insert the new integer add. Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), RHSConv->getOperand(0),"addconv"); @@ -1436,8 +1428,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { /// Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty) { - assert(DL && "Must have target data info for this"); - // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. 
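The visitAdd hunk above now delegates the "A+B --> A|B" fold to haveNoCommonBitsSet. The identity it relies on, verified exhaustively for 8-bit operands in a standalone sketch:

#include <cstdint>

int main() {
  // If a and b share no set bits, no column of the addition produces a
  // carry, so a + b and a | b agree: the fold visitAdd performs above.
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      if ((a & b) == 0 && a + b != (a | b))
        return 1; // never reached
  return 0;
}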
bool Swapped = false; @@ -1584,6 +1574,19 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { CI->getValue() == I.getType()->getPrimitiveSizeInBits() - 1) return BinaryOperator::CreateLShr(X, CI); } + + // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known + // zero. + APInt IntVal = C->getValue(); + if ((IntVal + 1).isPowerOf2()) { + unsigned BitWidth = I.getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(&I, KnownZero, KnownOne, 0, &I); + if ((IntVal | KnownZero).isAllOnesValue()) { + return BinaryOperator::CreateXor(Op1, C); + } + } } @@ -1662,26 +1665,24 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Optimize pointer differences into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". - if (DL) { - Value *LHSOp, *RHSOp; - if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && - match(Op1, m_PtrToInt(m_Value(RHSOp)))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) - return ReplaceInstUsesWith(I, Res); - - // trunc(p)-trunc(q) -> trunc(p-q) - if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && - match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) - return ReplaceInstUsesWith(I, Res); - } + Value *LHSOp, *RHSOp; + if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && + match(Op1, m_PtrToInt(m_Value(RHSOp)))) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + return ReplaceInstUsesWith(I, Res); + + // trunc(p)-trunc(q) -> trunc(p-q) + if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && + match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + return ReplaceInstUsesWith(I, Res); bool Changed = false; - if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, &I)) { + if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, I)) { Changed = true; I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, &I)) { + if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, I)) { Changed = true; I.setHasNoUnsignedWrap(true); } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 74b6970..ee21c81 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Intrinsics.h" @@ -22,30 +22,12 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// isFreeToInvert - Return true if the specified value is free to invert (apply -/// ~ to). This happens in cases where the ~ can be eliminated. -static inline bool isFreeToInvert(Value *V) { - // ~(~(X)) -> X. - if (BinaryOperator::isNot(V)) - return true; - - // Constants can be considered to be not'ed values. - if (isa<ConstantInt>(V)) - return true; - - // Compares can be inverted if they have a single use. 
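The new visitSub hunk above turns "C - X" into "C ^ X" when C is 2^n-1 and the remaining result bits are known zero: with no borrow possible, subtraction from an all-ones mask is bitwise complement within the mask. A standalone exhaustive check of the identity:

#include <cstdint>

int main() {
  // For C = 2^n - 1, subtracting X borrows nowhere as long as X's set bits
  // lie inside C, so C - X equals C ^ X.
  const unsigned c = 0xFF; // 2^8 - 1
  for (unsigned x = 0; x <= c; ++x)
    if (c - x != (c ^ x))
      return 1; // never reached
  return 0;
}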
- if (CmpInst *CI = dyn_cast<CmpInst>(V)) - return CI->hasOneUse(); - - return false; -} - static inline Value *dyn_castNotVal(Value *V) { // If this is not(not(x)) don't return that this is a not: we want the two // not's to be folded first. if (BinaryOperator::isNot(V)) { Value *Operand = BinaryOperator::getNotArgument(V); - if (!isFreeToInvert(Operand)) + if (!IsFreeToInvert(Operand, Operand->hasOneUse())) return Operand; } @@ -997,9 +979,9 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // Make a constant range that's the intersection of the two icmp ranges. // If the intersection is empty, we know that the result is false. ConstantRange LHSRange = - ConstantRange::makeICmpRegion(LHSCC, LHSCst->getValue()); + ConstantRange::makeAllowedICmpRegion(LHSCC, LHSCst->getValue()); ConstantRange RHSRange = - ConstantRange::makeICmpRegion(RHSCC, RHSCst->getValue()); + ConstantRange::makeAllowedICmpRegion(RHSCC, RHSCst->getValue()); if (LHSRange.intersectWith(RHSRange).isEmptySet()) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); @@ -1727,15 +1709,17 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Value *Mask = nullptr; Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AC, CxtI, DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AC, CxtI, DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, AC, CxtI, + DT) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, AC, CxtI, + DT)) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, AC, CxtI, - DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, AC, CxtI, - DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, AC, + CxtI, DT) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, AC, + CxtI, DT)) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } @@ -2585,8 +2569,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // ~(X & Y) --> (~X | ~Y) - De Morgan's Law // ~(X | Y) === (~X & ~Y) - De Morgan's Law - if (isFreeToInvert(Op0I->getOperand(0)) && - isFreeToInvert(Op0I->getOperand(1))) { + if (IsFreeToInvert(Op0I->getOperand(0), + Op0I->getOperand(0)->hasOneUse()) && + IsFreeToInvert(Op0I->getOperand(1), + Op0I->getOperand(1)->hasOneUse())) { Value *NotX = Builder->CreateNot(Op0I->getOperand(0), "notlhs"); Value *NotY = @@ -2604,15 +2590,16 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - - if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { - if (RHS->isOne() && Op0->hasOneUse()) + if (Constant *RHS = dyn_cast<Constant>(Op1)) { + if (RHS->isAllOnesValue() && Op0->hasOneUse()) // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B if (CmpInst *CI = dyn_cast<CmpInst>(Op0)) return CmpInst::Create(CI->getOpcode(), CI->getInversePredicate(), CI->getOperand(0), CI->getOperand(1)); + } + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp). 
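FoldAndOfICmps above intersects the two makeAllowedICmpRegion ranges and folds the conjunction to false when the intersection is empty. A trivial standalone rendering of that situation, using the ranges [0,5) and (10,255] as an example:

#include <cstdint>

int main() {
  // (x u< 5) && (x u> 10): the allowed regions do not intersect, so the
  // combined compare is constant false for every input.
  for (unsigned x = 0; x < 256; ++x)
    if ((x < 5) && (x > 10))
      return 1; // empty intersection: never reached
  return 0;
}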
if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 83b4b82..e83b9dd 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -11,16 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Statepoint.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" using namespace llvm; using namespace PatternMatch; @@ -60,8 +61,8 @@ static Type *reduceToSingleValueType(Type *T) { } Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { - unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AC, MI, DT); - unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AC, MI, DT); + unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, AC, DT); + unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, AC, DT); unsigned MinAlign = std::min(DstAlign, SrcAlign); unsigned CopyAlign = MI->getAlignment(); @@ -107,7 +108,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { if (StrippedDest != MI->getArgOperand(0)) { Type *SrcETy = cast<PointerType>(StrippedDest->getType()) ->getElementType(); - if (DL && SrcETy->isSized() && DL->getTypeStoreSize(SrcETy) == Size) { + if (SrcETy->isSized() && DL.getTypeStoreSize(SrcETy) == Size) { // The SrcETy might be something like {{{double}}} or [1 x double]. Rip // down through these levels if so. SrcETy = reduceToSingleValueType(SrcETy); @@ -155,7 +156,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { } Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AC, MI, DT); + unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, AC, DT); if (MI->getAlignment() < Alignment) { MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Alignment, false)); @@ -197,11 +198,137 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return nullptr; } +static Value *SimplifyX86insertps(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + VectorType *VecTy = cast<VectorType>(II.getType()); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); + + // The immediate permute control byte looks like this: + // [3:0] - zero mask for each 32-bit lane + // [5:4] - select one 32-bit destination lane + // [7:6] - select one 32-bit source lane + + uint8_t Imm = CInt->getZExtValue(); + uint8_t ZMask = Imm & 0xf; + uint8_t DestLane = (Imm >> 4) & 0x3; + uint8_t SourceLane = (Imm >> 6) & 0x3; + + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // If all zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (ZMask == 0xf) + return ZeroVector; + + // Initialize by passing all of the first source bits through. 
+ int ShuffleMask[4] = { 0, 1, 2, 3 }; + + // We may replace the second operand with the zero vector. + Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. + if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? + return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); + } + return nullptr; +} + +/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit +/// source vectors, unless a zero bit is set. If a zero bit is set, +/// then ignore that half of the mask and clear that half of the vector. +static Value *SimplifyX86vperm2(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + VectorType *VecTy = cast<VectorType>(II.getType()); + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + + uint8_t Imm = CInt->getZExtValue(); + + bool LowHalfZero = Imm & 0x08; + bool HighHalfZero = Imm & 0x80; + + // If both zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (LowHalfZero && HighHalfZero) + return ZeroVector; + + // If 0 or 1 zero mask bits are set, this is a simple shuffle. + unsigned NumElts = VecTy->getNumElements(); + unsigned HalfSize = NumElts / 2; + SmallVector<int, 8> ShuffleMask(NumElts); + + // The high bit of the selection field chooses the 1st or 2nd operand. + bool LowInputSelect = Imm & 0x02; + bool HighInputSelect = Imm & 0x20; + + // The low bit of the selection field chooses the low or high half + // of the selected operand. + bool LowHalfSelect = Imm & 0x01; + bool HighHalfSelect = Imm & 0x10; + + // Determine which operand(s) are actually in use for this instruction. + Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); + Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); + + // If needed, replace operands based on zero mask. + V0 = LowHalfZero ? ZeroVector : V0; + V1 = HighHalfZero ? ZeroVector : V1; + + // Permute low half of result. + unsigned StartIndex = LowHalfSelect ? HalfSize : 0; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i] = StartIndex + i; + + // Permute high half of result. + StartIndex = HighHalfSelect ? HalfSize : 0; + StartIndex += NumElts; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i + HalfSize] = StartIndex + i; + + return Builder.CreateShuffleVector(V0, V1, ShuffleMask); + } + return nullptr; +} + /// visitCallInst - CallInst simplification. This mostly only handles folding /// of intrinsic instructions. 
For normal calls, it allows visitCallSite to do /// the heavy lifting. /// Instruction *InstCombiner::visitCallInst(CallInst &CI) { + auto Args = CI.arg_operands(); + if (Value *V = SimplifyCall(CI.getCalledValue(), Args.begin(), Args.end(), DL, + TLI, DT, AC)) + return ReplaceInstUsesWith(CI, V); + if (isFreeCall(&CI, TLI)) return visitFree(CI); @@ -350,112 +477,36 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; - case Intrinsic::uadd_with_overflow: { - Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, II); - if (OR == OverflowResult::NeverOverflows) - return CreateOverflowTuple(II, Builder->CreateNUWAdd(LHS, RHS), false); - if (OR == OverflowResult::AlwaysOverflows) - return CreateOverflowTuple(II, Builder->CreateAdd(LHS, RHS), true); - } - // FALL THROUGH uadd into sadd + + case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: - // Canonicalize constants into the RHS. + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: if (isa<Constant>(II->getArgOperand(0)) && !isa<Constant>(II->getArgOperand(1))) { + // Canonicalize constants into the RHS. Value *LHS = II->getArgOperand(0); II->setArgOperand(0, II->getArgOperand(1)); II->setArgOperand(1, LHS); return II; } + // fall through - // X + undef -> undef - if (isa<UndefValue>(II->getArgOperand(1))) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { - // X + 0 -> {X, false} - if (RHS->isZero()) { - return CreateOverflowTuple(II, II->getArgOperand(0), false, - /*ReUseName*/false); - } - } - - // We can strength reduce reduce this signed add into a regular add if we - // can prove that it will never overflow. - if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) { - Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - if (WillNotOverflowSignedAdd(LHS, RHS, II)) { - return CreateOverflowTuple(II, Builder->CreateNSWAdd(LHS, RHS), false); - } - } - - break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: { - Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - // undef - X -> undef - // X - undef -> undef - if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) { - // X - 0 -> {X, false} - if (ConstRHS->isZero()) { - return CreateOverflowTuple(II, LHS, false, /*ReUseName*/false); - } - } - if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow) { - if (WillNotOverflowSignedSub(LHS, RHS, II)) { - return CreateOverflowTuple(II, Builder->CreateNSWSub(LHS, RHS), false); - } - } else { - if (WillNotOverflowUnsignedSub(LHS, RHS, II)) { - return CreateOverflowTuple(II, Builder->CreateNUWSub(LHS, RHS), false); - } - } - break; - } - case Intrinsic::umul_with_overflow: { - Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, II); - if (OR == OverflowResult::NeverOverflows) - return CreateOverflowTuple(II, Builder->CreateNUWMul(LHS, RHS), false); - if (OR == OverflowResult::AlwaysOverflows) - return CreateOverflowTuple(II, Builder->CreateMul(LHS, RHS), true); - } // FALL THROUGH - case Intrinsic::smul_with_overflow: - // Canonicalize constants into the RHS. 
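The uadd.with.overflow handling being consolidated above asks whether the add can ever wrap and, if not, emits a plain add plus a constant-false flag. A standalone 8-bit model of the flag it computes (the pair-returning helper here is hypothetical, not LLVM API):

#include <cstdint>
#include <utility>

// Model of uadd.with.overflow at 8 bits: result plus overflow flag.
static std::pair<uint8_t, bool> uaddWithOverflow(uint8_t a, uint8_t b) {
  uint8_t r = (uint8_t)(a + b);
  return {r, r < a}; // wrapped iff the 8-bit sum is smaller than an input
}

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      auto [r, ov] = uaddWithOverflow((uint8_t)a, (uint8_t)b);
      // The flag is exactly "the true sum exceeds the type's maximum"; when
      // analysis proves this false, the intrinsic becomes a plain add and a
      // constant-false flag.
      if (ov != (a + b > 255) || (!ov && r != a + b))
        return 1; // never reached
    }
  return 0;
}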
- if (isa<Constant>(II->getArgOperand(0)) && - !isa<Constant>(II->getArgOperand(1))) { - Value *LHS = II->getArgOperand(0); - II->setArgOperand(0, II->getArgOperand(1)); - II->setArgOperand(1, LHS); - return II; - } - - // X * undef -> undef - if (isa<UndefValue>(II->getArgOperand(1))) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); + OverflowCheckFlavor OCF = + IntrinsicIDToOverflowCheckFlavor(II->getIntrinsicID()); + assert(OCF != OCF_INVALID && "unexpected!"); - if (ConstantInt *RHSI = dyn_cast<ConstantInt>(II->getArgOperand(1))) { - // X*0 -> {0, false} - if (RHSI->isZero()) - return ReplaceInstUsesWith(CI, Constant::getNullValue(II->getType())); + Value *OperationResult = nullptr; + Constant *OverflowResult = nullptr; + if (OptimizeOverflowCheck(OCF, II->getArgOperand(0), II->getArgOperand(1), + *II, OperationResult, OverflowResult)) + return CreateOverflowTuple(II, OperationResult, OverflowResult); - // X * 1 -> {X, false} - if (RHSI->equalsInt(1)) { - return CreateOverflowTuple(II, II->getArgOperand(0), false, - /*ReUseName*/false); - } - } - if (II->getIntrinsicID() == Intrinsic::smul_with_overflow) { - Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - if (WillNotOverflowSignedMul(LHS, RHS, II)) { - return CreateOverflowTuple(II, Builder->CreateNSWMul(LHS, RHS), false); - } - } break; + } + case Intrinsic::minnum: case Intrinsic::maxnum: { Value *Arg0 = II->getArgOperand(0); @@ -543,7 +594,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >= + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >= 16) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); @@ -560,7 +611,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >= + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >= 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); @@ -575,11 +626,54 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); return new StoreInst(II->getArgOperand(0), Ptr, false, 1); } + case Intrinsic::ppc_qpx_qvlfs: + // Turn PPC QPX qvlfs -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >= + 16) { + Type *VTy = VectorType::get(Builder->getFloatTy(), + II->getType()->getVectorNumElements()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(VTy)); + Value *Load = Builder->CreateLoad(Ptr); + return new FPExtInst(Load, II->getType()); + } + break; + case Intrinsic::ppc_qpx_qvlfd: + // Turn PPC QPX qvlfd -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >= + 32) { + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(Ptr); + } + break; + case Intrinsic::ppc_qpx_qvstfs: + // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >= + 16) { + Type *VTy = VectorType::get(Builder->getFloatTy(), + II->getArgOperand(0)->getType()->getVectorNumElements()); + Value *TOp = Builder->CreateFPTrunc(II->getArgOperand(0), VTy); + Type *OpPtrTy = PointerType::getUnqual(VTy); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(TOp, Ptr); + } + break; + case Intrinsic::ppc_qpx_qvstfd: + // Turn PPC QPX qvstfd -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, AC, DT) >= + 32) { + Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); + } + break; case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: // Turn X86 storeu -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >= + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >= 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(1)->getType()); @@ -696,15 +790,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { unsigned LowHalfElts = VWidth / 2; APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); APInt UndefElts(VWidth, 0); - if (Value *TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), - InputDemandedElts, - UndefElts)) { + if (Value *TmpV = SimplifyDemandedVectorElts( + II->getArgOperand(0), InputDemandedElts, UndefElts)) { II->setArgOperand(0, TmpV); return II; } break; } - + case Intrinsic::x86_sse41_insertps: + if (Value *V = SimplifyX86insertps(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::x86_sse4a_insertqi: { // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top // ones undef @@ -867,6 +964,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(CI, Shuffle); } + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + case Intrinsic::x86_avx2_vperm2i128: + if (Value *V = SimplifyX86vperm2(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -906,12 +1011,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { unsigned Idx = cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); Idx &= 31; // Match the hardware behavior. - if (DL && DL->isLittleEndian()) + if (DL.isLittleEndian()) Idx = 31 - Idx; if (!ExtractedElts[Idx]) { - Value *Op0ToUse = (DL && DL->isLittleEndian()) ? Op1 : Op0; - Value *Op1ToUse = (DL && DL->isLittleEndian()) ? Op0 : Op1; + Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; ExtractedElts[Idx] = Builder->CreateExtractElement(Idx < 16 ? 
Op0ToUse : Op1ToUse, Builder->getInt32(Idx&15)); @@ -940,7 +1045,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: { - unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AC, II, DT); + unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, AC, DT); unsigned AlignArg = II->getNumArgOperands() - 1; ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { @@ -1079,7 +1184,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { RHS->getType()->isPointerTy() && cast<Constant>(RHS)->isNullValue()) { LoadInst* LI = cast<LoadInst>(LHS); - if (isValidAssumeForContext(II, LI, DL, DT)) { + if (isValidAssumeForContext(II, LI, DT)) { MDNode *MD = MDNode::get(II->getContext(), None); LI->setMetadata(LLVMContext::MD_nonnull, MD); return EraseInstFromFunction(*II); @@ -1102,7 +1207,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // facts about the relocate value, while being careful to // preserve relocation semantics. GCRelocateOperands Operands(II); - Value *DerivedPtr = Operands.derivedPtr(); + Value *DerivedPtr = Operands.getDerivedPtr(); + auto *GCRelocateType = cast<PointerType>(II->getType()); // Remove the relocation if unused, note that this check is required // to prevent the cases below from looping forever. @@ -1113,24 +1219,34 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // TODO: provide a hook for this in GCStrategy. This is clearly legal for // most practical collectors, but there was discussion in the review thread // about whether it was legal for all possible collectors. - if (isa<UndefValue>(DerivedPtr)) - return ReplaceInstUsesWith(*II, DerivedPtr); + if (isa<UndefValue>(DerivedPtr)) { + // gc_relocate is uncasted. Use undef of gc_relocate's type to replace it. + return ReplaceInstUsesWith(*II, UndefValue::get(GCRelocateType)); + } // The relocation of null will be null for most any collector. // TODO: provide a hook for this in GCStrategy. There might be some weird // collector this property does not hold for. - if (isa<ConstantPointerNull>(DerivedPtr)) - return ReplaceInstUsesWith(*II, DerivedPtr); + if (isa<ConstantPointerNull>(DerivedPtr)) { + // gc_relocate is uncasted. Use null-pointer of gc_relocate's type to replace it. + return ReplaceInstUsesWith(*II, ConstantPointerNull::get(GCRelocateType)); + } // isKnownNonNull -> nonnull attribute if (isKnownNonNull(DerivedPtr)) II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); - // TODO: dereferenceable -> deref attribute + // isDereferenceablePointer -> deref attribute + if (isDereferenceablePointer(DerivedPtr, DL)) { + if (Argument *A = dyn_cast<Argument>(DerivedPtr)) { + uint64_t Bytes = A->getDereferenceableBytes(); + II->addDereferenceableAttr(AttributeSet::ReturnIndex, Bytes); + } + } // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) // Canonicalize on the type from the uses to the defs - + // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) } } @@ -1147,8 +1263,8 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { /// isSafeToEliminateVarargsCast - If this cast does not affect the value /// passed through the varargs area, we can eliminate the use of the cast. 
static bool isSafeToEliminateVarargsCast(const CallSite CS, - const CastInst * const CI, - const DataLayout * const DL, + const DataLayout &DL, + const CastInst *const CI, const int ix) { if (!CI->isLosslessCast()) return false; @@ -1172,7 +1288,7 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, Type* DstTy = cast<PointerType>(CI->getType())->getElementType(); if (!SrcTy->isSized() || !DstTy->isSized()) return false; - if (!DL || DL->getTypeAllocSize(SrcTy) != DL->getTypeAllocSize(DstTy)) + if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) return false; return true; } @@ -1181,10 +1297,14 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, // Currently we're only working with the checking functions, memcpy_chk, // mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk, // strcat_chk and strncat_chk. -Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) { +Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; - if (Value *With = Simplifier->optimizeCall(CI)) { + auto InstCombineRAUW = [this](Instruction *From, Value *With) { + ReplaceInstUsesWith(*From, With); + }; + LibCallSimplifier Simplifier(DL, TLI, InstCombineRAUW); + if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With); } @@ -1342,7 +1462,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(), E = CS.arg_end(); I != E; ++I, ++ix) { CastInst *CI = dyn_cast<CastInst>(*I); - if (CI && isSafeToEliminateVarargsCast(CS, CI, DL, ix)) { + if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) { *I = CI->getOperand(0); Changed = true; } @@ -1359,7 +1479,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // this. None of these calls are seen as possibly dead so go ahead and // delete the instruction now. if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { - Instruction *I = tryOptimizeCall(CI, DL); + Instruction *I = tryOptimizeCall(CI); // If we changed something return the result, etc. Otherwise let // the fallthrough check. if (I) return EraseInstFromFunction(*I); @@ -1409,10 +1529,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (!CallerPAL.isEmpty() && !Caller->use_empty()) { AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); - if (RAttrs. - hasAttributes(AttributeFuncs:: - typeIncompatible(NewRetTy, AttributeSet::ReturnIndex), - AttributeSet::ReturnIndex)) + if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) return false; // Attribute not compatible with transformed value. } @@ -1438,7 +1555,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // // into: // call void @takes_i32_inalloca(i32* null) - if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca)) + // + // Similarly, avoid folding away bitcasts of byval calls. + if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || + Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) return false; CallSite::arg_iterator AI = CS.arg_begin(); @@ -1450,8 +1570,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { return false; // Cannot transform this parameter value. if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1). 
- hasAttributes(AttributeFuncs:: - typeIncompatible(ParamTy, i + 1), i + 1)) + overlaps(AttributeFuncs::typeIncompatible(ParamTy))) return false; // Attribute not compatible with transformed value. if (CS.isInAllocaArgument(i)) @@ -1463,12 +1582,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1, Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); - if (!ParamPTy || !ParamPTy->getElementType()->isSized() || !DL) + if (!ParamPTy || !ParamPTy->getElementType()->isSized()) return false; Type *CurElTy = ActTy->getPointerElementType(); - if (DL->getTypeAllocSize(CurElTy) != - DL->getTypeAllocSize(ParamPTy->getElementType())) + if (DL.getTypeAllocSize(CurElTy) != + DL.getTypeAllocSize(ParamPTy->getElementType())) return false; } } @@ -1524,10 +1643,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. - RAttrs. - removeAttributes(AttributeFuncs:: - typeIncompatible(NewRetTy, AttributeSet::ReturnIndex), - AttributeSet::ReturnIndex); + RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); // Add the new return attributes. if (RAttrs.hasAttributes()) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 5415726..48ab0eb 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; @@ -80,9 +80,6 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, /// try to eliminate the cast by moving the type information into the alloc. Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { - // This requires DataLayout to get the alloca alignment and size information. - if (!DL) return nullptr; - PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); @@ -93,8 +90,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, Type *CastElTy = PTy->getElementType(); if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; - unsigned AllocElTyAlign = DL->getABITypeAlignment(AllocElTy); - unsigned CastElTyAlign = DL->getABITypeAlignment(CastElTy); + unsigned AllocElTyAlign = DL.getABITypeAlignment(AllocElTy); + unsigned CastElTyAlign = DL.getABITypeAlignment(CastElTy); if (CastElTyAlign < AllocElTyAlign) return nullptr; // If the allocation has multiple uses, only promote it if we are strictly @@ -102,14 +99,14 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // same, we open the door to infinite loops of various kinds. 
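PromoteCastOfAllocation above only rewrites the alloca when the cast type's ABI alignment does not exceed the allocation's, and the byval check before it insists on equal alloc sizes. A standalone sketch of those two gates using the host ABI via alignof/sizeof (an approximation of the DataLayout queries, not the real thing):

#include <cstdint>

// A bitcast of an allocation from AllocTy to CastTy is only entertained when
// the cast type would not demand more alignment, and (for the byval fold)
// when both types occupy the same number of allocated bytes.
template <typename AllocTy, typename CastTy>
constexpr bool castOfAllocationLooksSafe() {
  return alignof(CastTy) <= alignof(AllocTy) &&
         sizeof(CastTy) == sizeof(AllocTy);
}

static_assert(castOfAllocationLooksSafe<uint64_t, uint64_t>(),
              "identical layout is trivially fine");
static_assert(!castOfAllocationLooksSafe<uint32_t, uint64_t>(),
              "different alloc size must be rejected");

int main() { return 0; }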
if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr; - uint64_t AllocElTySize = DL->getTypeAllocSize(AllocElTy); - uint64_t CastElTySize = DL->getTypeAllocSize(CastElTy); + uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy); + uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy); if (CastElTySize == 0 || AllocElTySize == 0) return nullptr; // If the allocation has multiple uses, only promote it if we're not // shrinking the amount of memory being allocated. - uint64_t AllocElTyStoreSize = DL->getTypeStoreSize(AllocElTy); - uint64_t CastElTyStoreSize = DL->getTypeStoreSize(CastElTy); + uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy); + uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy); if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr; // See if we can satisfy the modulus by pulling a scale out of the array @@ -215,7 +212,8 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, PHINode *OPN = cast<PHINode>(I); PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues()); for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) { - Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned); + Value *V = + EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned); NPN->addIncoming(V, OPN->getIncomingBlock(i)); } Res = NPN; @@ -234,25 +232,22 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, /// This function is a wrapper around CastInst::isEliminableCastPair. It /// simply extracts arguments and returns what that function returns. static Instruction::CastOps -isEliminableCastPair( - const CastInst *CI, ///< The first cast instruction - unsigned opcode, ///< The opcode of the second cast instruction - Type *DstTy, ///< The target type for the second cast instruction - const DataLayout *DL ///< The target data for pointer size -) { - +isEliminableCastPair(const CastInst *CI, ///< First cast instruction + unsigned opcode, ///< Opcode for the second cast + Type *DstTy, ///< Target type for the second cast + const DataLayout &DL) { Type *SrcTy = CI->getOperand(0)->getType(); // A from above Type *MidTy = CI->getType(); // B from above // Get the opcodes of the two Cast instructions Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode()); Instruction::CastOps secondOp = Instruction::CastOps(opcode); - Type *SrcIntPtrTy = DL && SrcTy->isPtrOrPtrVectorTy() ? - DL->getIntPtrType(SrcTy) : nullptr; - Type *MidIntPtrTy = DL && MidTy->isPtrOrPtrVectorTy() ? - DL->getIntPtrType(MidTy) : nullptr; - Type *DstIntPtrTy = DL && DstTy->isPtrOrPtrVectorTy() ? - DL->getIntPtrType(DstTy) : nullptr; + Type *SrcIntPtrTy = + SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr; + Type *MidIntPtrTy = + MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr; + Type *DstIntPtrTy = + DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr; unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy, SrcIntPtrTy, MidIntPtrTy, DstIntPtrTy); @@ -298,7 +293,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { // eliminate it now. if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast if (Instruction::CastOps opc = - isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), DL)) { + isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), DL)) { // The first cast (CSrc) is eliminable so we need to fix up or replace // the second cast (CI). CSrc will then have a good chance of being dead. 
return CastInst::Create(opc, CSrc->getOperand(0), CI.getType()); @@ -314,8 +309,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { if (isa<PHINode>(Src)) { // We don't do this if this would create a PHI node with an illegal type if // it is currently legal. - if (!Src->getType()->isIntegerTy() || - !CI.getType()->isIntegerTy() || + if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() || ShouldChangeType(CI.getType(), Src->getType())) if (Instruction *NV = FoldOpIntoPhi(CI)) return NV; @@ -424,8 +418,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateTruncated(PN->getIncomingValue(i), Ty, IC, CxtI)) + for (Value *IncValue : PN->incoming_values()) + if (!CanEvaluateTruncated(IncValue, Ty, IC, CxtI)) return false; return true; } @@ -441,6 +435,15 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (Instruction *Result = commonCastTransforms(CI)) return Result; + // Test if the trunc is the user of a select which is part of a + // minimum or maximum operation. If so, don't do any more simplification. + // Even simplifying demanded bits can break the canonical form of a + // min/max. + Value *LHS, *RHS; + if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0))) + if (matchSelectPattern(SI, LHS, RHS) != SPF_UNKNOWN) + return nullptr; + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) @@ -1035,8 +1038,8 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateSExtd(PN->getIncomingValue(i), Ty)) return false; + for (Value *IncValue : PN->incoming_values()) + if (!CanEvaluateSExtd(IncValue, Ty)) return false; return true; } default: @@ -1064,6 +1067,15 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); + // If we know that the value being extended is positive, we can use a zext + // instead. + bool KnownZero, KnownOne; + ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI); + if (KnownZero) { + Value *ZExt = Builder->CreateZExt(Src, DestTy); + return ReplaceInstUsesWith(CI, ZExt); + } + // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also @@ -1332,22 +1344,57 @@ Instruction *InstCombiner::visitFPExt(CastInst &CI) { return commonCastTransforms(CI); } +// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X) +// This is safe if the intermediate type has enough bits in its mantissa to +// accurately represent all values of X. For example, this won't work with +// i64 -> float -> i64. 
+Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) { + if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0))) + return nullptr; + Instruction *OpI = cast<Instruction>(FI.getOperand(0)); + + Value *SrcI = OpI->getOperand(0); + Type *FITy = FI.getType(); + Type *OpITy = OpI->getType(); + Type *SrcTy = SrcI->getType(); + bool IsInputSigned = isa<SIToFPInst>(OpI); + bool IsOutputSigned = isa<FPToSIInst>(FI); + + // We can safely assume the conversion won't overflow the output range, + // because (for example) (uint8_t)18293.f is undefined behavior. + + // Since we can assume the conversion won't overflow, our decision as to + // whether the input will fit in the float should depend on the minimum + // of the input range and output range. + + // This means this is also safe for a signed input and unsigned output, since + // a negative input would lead to undefined behavior. + int InputSize = (int)SrcTy->getScalarSizeInBits() - IsInputSigned; + int OutputSize = (int)FITy->getScalarSizeInBits() - IsOutputSigned; + int ActualSize = std::min(InputSize, OutputSize); + + if (ActualSize <= OpITy->getFPMantissaWidth()) { + if (FITy->getScalarSizeInBits() > SrcTy->getScalarSizeInBits()) { + if (IsInputSigned && IsOutputSigned) + return new SExtInst(SrcI, FITy); + return new ZExtInst(SrcI, FITy); + } + if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()) + return new TruncInst(SrcI, FITy); + if (SrcTy == FITy) + return ReplaceInstUsesWith(FI, SrcI); + return new BitCastInst(SrcI, FITy); + } + return nullptr; +} + Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); if (!OpI) return commonCastTransforms(FI); - // fptoui(uitofp(X)) --> X - // fptoui(sitofp(X)) --> X - // This is safe if the intermediate type has enough bits in its mantissa to - // accurately represent all values of X. For example, do not do this with - // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. - if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && - OpI->getOperand(0)->getType() == FI.getType() && - (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */ - OpI->getType()->getFPMantissaWidth()) - return ReplaceInstUsesWith(FI, OpI->getOperand(0)); + if (Instruction *I = FoldItoFPtoI(FI)) + return I; return commonCastTransforms(FI); } @@ -1357,17 +1404,8 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { if (!OpI) return commonCastTransforms(FI); - // fptosi(sitofp(X)) --> X - // fptosi(uitofp(X)) --> X - // This is safe if the intermediate type has enough bits in its mantissa to - // accurately represent all values of X. For example, do not do this with - // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. - if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && - OpI->getOperand(0)->getType() == FI.getType() && - (int)FI.getType()->getScalarSizeInBits() <= - OpI->getType()->getFPMantissaWidth()) - return ReplaceInstUsesWith(FI, OpI->getOperand(0)); + if (Instruction *I = FoldItoFPtoI(FI)) + return I; return commonCastTransforms(FI); } @@ -1384,18 +1422,15 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. 
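FoldItoFPtoI above keeps the int -> fp -> int round trip only when the smaller of the input and output effective widths fits in the intermediate type's mantissa. A standalone demonstration of both ends: every i16 survives a trip through float (24-bit mantissa), while an i64 example does not:

#include <cstdint>

int main() {
  // 16 bits (minus the sign bit) fit in float's 24-bit mantissa, so the
  // round trip is the identity for every int16_t:
  for (int32_t v = -32768; v <= 32767; ++v)
    if ((int16_t)(float)(int16_t)v != v)
      return 1; // never reached
  // With i64 -> float -> i64 the fold would be wrong: 24 mantissa bits
  // cannot represent every 64-bit value.
  int64_t big = (1LL << 60) + 1;     // needs 61 significant bits
  return (int64_t)(float)big == big; // 0: the round trip loses the +1
}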
- - if (DL) { - unsigned AS = CI.getAddressSpace(); - if (CI.getOperand(0)->getType()->getScalarSizeInBits() != - DL->getPointerSizeInBits(AS)) { - Type *Ty = DL->getIntPtrType(CI.getContext(), AS); - if (CI.getType()->isVectorTy()) // Handle vectors of pointers. - Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); - - Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty); - return new IntToPtrInst(P, CI.getType()); - } + unsigned AS = CI.getAddressSpace(); + if (CI.getOperand(0)->getType()->getScalarSizeInBits() != + DL.getPointerSizeInBits(AS)) { + Type *Ty = DL.getIntPtrType(CI.getContext(), AS); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty); + return new IntToPtrInst(P, CI.getType()); } if (Instruction *I = commonCastTransforms(CI)) @@ -1424,41 +1459,6 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { CI.setOperand(0, GEP->getOperand(0)); return &CI; } - - if (!DL) - return commonCastTransforms(CI); - - // If the GEP has a single use, and the base pointer is a bitcast, and the - // GEP computes a constant offset, see if we can convert these three - // instructions into fewer. This typically happens with unions and other - // non-type-safe code. - unsigned AS = GEP->getPointerAddressSpace(); - unsigned OffsetBits = DL->getPointerSizeInBits(AS); - APInt Offset(OffsetBits, 0); - BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0)); - if (GEP->hasOneUse() && - BCI && - GEP->accumulateConstantOffset(*DL, Offset)) { - // Get the base pointer input of the bitcast, and the type it points to. - Value *OrigBase = BCI->getOperand(0); - SmallVector<Value*, 8> NewIndices; - if (FindElementAtOffset(OrigBase->getType(), - Offset.getSExtValue(), - NewIndices)) { - // If we were able to index down into an element, create the GEP - // and bitcast the result. This eliminates one bitcast, potentially - // two. - Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ? - Builder->CreateInBoundsGEP(OrigBase, NewIndices) : - Builder->CreateGEP(OrigBase, NewIndices); - NGEP->takeName(GEP); - - if (isa<BitCastInst>(CI)) - return new BitCastInst(NGEP, CI.getType()); - assert(isa<PtrToIntInst>(CI)); - return new PtrToIntInst(NGEP, CI.getType()); - } - } } return commonCastTransforms(CI); @@ -1469,16 +1469,13 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. - if (!DL) - return commonPointerCastTransforms(CI); - Type *Ty = CI.getType(); unsigned AS = CI.getPointerAddressSpace(); - if (Ty->getScalarSizeInBits() == DL->getPointerSizeInBits(AS)) + if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS)) return commonPointerCastTransforms(CI); - Type *PtrTy = DL->getIntPtrType(CI.getContext(), AS); + Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS); if (Ty->isVectorTy()) // Handle vectors of pointers. PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements()); @@ -1562,8 +1559,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. 
static bool CollectInsertionElements(Value *V, unsigned Shift, - SmallVectorImpl<Value*> &Elements, - Type *VecEltTy, InstCombiner &IC) { + SmallVectorImpl<Value *> &Elements, + Type *VecEltTy, bool isBigEndian) { assert(isMultipleOfTypeSize(Shift, VecEltTy) && "Shift should be a multiple of the element type size"); @@ -1579,7 +1576,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, return true; unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy); - if (IC.getDataLayout()->isBigEndian()) + if (isBigEndian) ElementIndex = Elements.size() - ElementIndex - 1; // Fail if multiple elements are inserted into this slot. @@ -1599,7 +1596,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, // it to the right type so it gets properly inserted. if (NumElts == 1) return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), - Shift, Elements, VecEltTy, IC); + Shift, Elements, VecEltTy, isBigEndian); // Okay, this is a constant that covers multiple elements. Slice it up into // pieces and insert each element-sized piece into the vector. @@ -1614,7 +1611,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC)) + if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + isBigEndian)) return false; } return true; @@ -1627,28 +1625,28 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: - return CollectInsertionElements(I->getOperand(0), Shift, - Elements, VecEltTy, IC); + return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, - Elements, VecEltTy, IC); + return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian); case Instruction::Or: - return CollectInsertionElements(I->getOperand(0), Shift, - Elements, VecEltTy, IC) && - CollectInsertionElements(I->getOperand(1), Shift, - Elements, VecEltTy, IC); + return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian) && + CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, + isBigEndian); case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1)); if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, - Elements, VecEltTy, IC); + return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian); } } @@ -1671,15 +1669,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, /// Into two insertelements that do "buildvector{%inc, %inc5}". static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { - // We need to know the target byte order to perform this optimization. 
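CollectInsertionElements above converts a bit offset into a vector element index and flips it on big-endian targets, since the low bits of the integer land in element 0 only on little-endian. A standalone sketch of that index math:

#include <cstdio>

// Map a bit offset within the integer to a vector element index, flipping
// it on big-endian targets as the code above does.
static unsigned elementIndex(unsigned shift, unsigned eltBits,
                             unsigned numElts, bool isBigEndian) {
  unsigned idx = shift / eltBits;
  return isBigEndian ? numElts - idx - 1 : idx;
}

int main() {
  // A 32-bit value built as (e3<<24)|(e2<<16)|(e1<<8)|e0, viewed as <4 x i8>:
  std::printf("LE: shift 0 -> elt %u, shift 24 -> elt %u\n",
              elementIndex(0, 8, 4, false), elementIndex(24, 8, 4, false));
  std::printf("BE: shift 0 -> elt %u, shift 24 -> elt %u\n",
              elementIndex(0, 8, 4, true), elementIndex(24, 8, 4, true));
  return 0; // LE: 0 and 3; BE: 3 and 0
}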
- if (!IC.getDataLayout()) return nullptr; - VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); if (!CollectInsertionElements(IntInput, 0, Elements, - DestVecTy->getElementType(), IC)) + DestVecTy->getElementType(), + IC.getDataLayout().isBigEndian())) return nullptr; // If we succeeded, we know that all of the element are specified by Elements @@ -1699,10 +1695,8 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, /// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double /// bitcast. The various long double bitcasts can't get in here. -static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ - // We need to know the target byte order to perform this optimization. - if (!IC.getDataLayout()) return nullptr; - +static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC, + const DataLayout &DL) { Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(); @@ -1725,7 +1719,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ } unsigned Elt = 0; - if (IC.getDataLayout()->isBigEndian()) + if (DL.isBigEndian()) Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } @@ -1749,7 +1743,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ } unsigned Elt = ShAmt->getZExtValue() / DestWidth; - if (IC.getDataLayout()->isBigEndian()) + if (DL.isBigEndian()) Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } @@ -1785,26 +1779,24 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If the source and destination are pointers, and this cast is equivalent // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep. // This can enhance SROA and other transforms that want type-safe pointers. - Constant *ZeroUInt = - Constant::getNullValue(Type::getInt32Ty(CI.getContext())); unsigned NumZeros = 0; while (SrcElTy != DstElTy && isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() && SrcElTy->getNumContainedTypes() /* not "{}" */) { - SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt); + SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(0U); ++NumZeros; } // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { - SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt); + SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder->getInt32(0)); return GetElementPtrInst::CreateInBounds(Src, Idxs); } } // Try to optimize int -> float bitcasts. 
if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) - if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this)) + if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL)) return I; if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index c07c96d..2dafa58 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" @@ -24,7 +24,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; using namespace PatternMatch; @@ -229,10 +229,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, Instruction *InstCombiner:: FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { - // We need TD information to know the pointer size unless this is inbounds. - if (!GEP->isInBounds() && !DL) - return nullptr; - Constant *Init = GV->getInitializer(); if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) return nullptr; @@ -303,7 +299,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // the array, this will fully represent all the comparison results. uint64_t MagicBitvector = 0; - // Scan the array and see if one of our patterns matches. Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { @@ -398,7 +393,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // index down like the GEP would do implicitly. We don't have to do this for // an inbounds GEP because the index can't be out of range. if (!GEP->isInBounds()) { - Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); unsigned PtrSize = IntPtrTy->getIntegerBitWidth(); if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize) Idx = Builder->CreateTrunc(Idx, IntPtrTy); @@ -487,10 +482,8 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // - Default to i32 if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth()) Ty = Idx->getType(); - else if (DL) - Ty = DL->getSmallestLegalIntType(Init->getContext(), ArrayElementCount); - else if (ArrayElementCount <= 32) - Ty = Type::getInt32Ty(Init->getContext()); + else + Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount); if (Ty) { Value *V = Builder->CreateIntCast(Idx, Ty, false); @@ -514,8 +507,8 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, /// /// If we can't emit an optimized form for this expression, this returns null. /// -static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { - const DataLayout &DL = *IC.getDataLayout(); +static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, + const DataLayout &DL) { gep_type_iterator GTI = gep_type_begin(GEP); // Check to see if this gep only has a single variable index. 
If so, and if @@ -628,12 +621,12 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, RHS = RHS->stripPointerCasts(); Value *PtrBase = GEPLHS->getOperand(0); - if (DL && PtrBase == RHS && GEPLHS->isInBounds()) { + if (PtrBase == RHS && GEPLHS->isInBounds()) { // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). // This transformation (ignoring the base and scales) is valid because we // know pointers can't overflow since the gep is inbounds. See if we can // output an optimized form. - Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this); + Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this, DL); // If not, synthesize the offset the hard way. if (!Offset) @@ -661,11 +654,11 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If we're comparing GEPs with two base pointers that only differ in type // and both GEPs have only constant indices or just one use, then fold // the compare with the adjusted indices. - if (DL && GEPLHS->isInBounds() && GEPRHS->isInBounds() && + if (GEPLHS->isInBounds() && GEPRHS->isInBounds() && (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && PtrBase->stripPointerCasts() == - GEPRHS->getOperand(0)->stripPointerCasts()) { + GEPRHS->getOperand(0)->stripPointerCasts()) { Value *LOffset = EmitGEPOffset(GEPLHS); Value *ROffset = EmitGEPOffset(GEPRHS); @@ -733,9 +726,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // Only lower this if the icmp is the only user of the GEP or if we expect // the result to fold to a constant! - if (DL && - GEPsInBounds && - (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) && + if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) && (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) Value *L = EmitGEPOffset(GEPLHS); @@ -1928,17 +1919,20 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the // integer type is the same size as the pointer type. - if (DL && LHSCI->getOpcode() == Instruction::PtrToInt && - DL->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { + if (LHSCI->getOpcode() == Instruction::PtrToInt && + DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { Value *RHSOp = nullptr; - if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) { + if (PtrToIntOperator *RHSC = dyn_cast<PtrToIntOperator>(ICI.getOperand(1))) { + Value *RHSCIOp = RHSC->getOperand(0); + if (RHSCIOp->getType()->getPointerAddressSpace() == + LHSCIOp->getType()->getPointerAddressSpace()) { + RHSOp = RHSC->getOperand(0); + // If the pointer types don't match, insert a bitcast. + if (LHSCIOp->getType() != RHSOp->getType()) + RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType()); + } + } else if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); - } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) { - RHSOp = RHSC->getOperand(0); - // If the pointer types don't match, insert a bitcast. 
- if (LHSCIOp->getType() != RHSOp->getType()) - RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType()); - } if (RHSOp) return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp); @@ -2103,7 +2097,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc"); Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc"); - CallInst *Call = Builder->CreateCall2(F, TruncA, TruncB, "sadd"); + CallInst *Call = Builder->CreateCall(F, {TruncA, TruncB}, "sadd"); Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result"); Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType()); @@ -2115,33 +2109,94 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, return ExtractValueInst::Create(Call, 1, "sadd.overflow"); } -static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV, - InstCombiner &IC) { - // Don't bother doing this transformation for pointers, don't do it for - // vectors. - if (!isa<IntegerType>(OrigAddV->getType())) return nullptr; +bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, + Value *RHS, Instruction &OrigI, + Value *&Result, Constant *&Overflow) { + assert((!OrigI.isCommutative() || + !(isa<Constant>(LHS) && !isa<Constant>(RHS))) && + "call with a constant RHS if possible!"); + + auto SetResult = [&](Value *OpResult, Constant *OverflowVal, bool ReuseName) { + Result = OpResult; + Overflow = OverflowVal; + if (ReuseName) + Result->takeName(&OrigI); + return true; + }; - // If the add is a constant expr, then we don't bother transforming it. - Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV); - if (!OrigAdd) return nullptr; + switch (OCF) { + case OCF_INVALID: + llvm_unreachable("bad overflow check kind!"); - Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1); + case OCF_UNSIGNED_ADD: { + OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, &OrigI); + if (OR == OverflowResult::NeverOverflows) + return SetResult(Builder->CreateNUWAdd(LHS, RHS), Builder->getFalse(), + true); - // Put the new code above the original add, in case there are any uses of the - // add between the add and the compare. - InstCombiner::BuilderTy *Builder = IC.Builder; - Builder->SetInsertPoint(OrigAdd); + if (OR == OverflowResult::AlwaysOverflows) + return SetResult(Builder->CreateAdd(LHS, RHS), Builder->getTrue(), true); + } + // FALL THROUGH uadd into sadd + case OCF_SIGNED_ADD: { + // X + 0 -> {X, false} + if (match(RHS, m_Zero())) + return SetResult(LHS, Builder->getFalse(), false); + + // We can strength reduce this signed add into a regular add if we can prove + // that it will never overflow. 
+ if (OCF == OCF_SIGNED_ADD) + if (WillNotOverflowSignedAdd(LHS, RHS, OrigI)) + return SetResult(Builder->CreateNSWAdd(LHS, RHS), Builder->getFalse(), + true); + } - Module *M = I.getParent()->getParent()->getParent(); - Type *Ty = LHS->getType(); - Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty); - CallInst *Call = Builder->CreateCall2(F, LHS, RHS, "uadd"); - Value *Add = Builder->CreateExtractValue(Call, 0); + case OCF_UNSIGNED_SUB: + case OCF_SIGNED_SUB: { + // X - 0 -> {X, false} + if (match(RHS, m_Zero())) + return SetResult(LHS, Builder->getFalse(), false); + + if (OCF == OCF_SIGNED_SUB) { + if (WillNotOverflowSignedSub(LHS, RHS, OrigI)) + return SetResult(Builder->CreateNSWSub(LHS, RHS), Builder->getFalse(), + true); + } else { + if (WillNotOverflowUnsignedSub(LHS, RHS, OrigI)) + return SetResult(Builder->CreateNUWSub(LHS, RHS), Builder->getFalse(), + true); + } + break; + } - IC.ReplaceInstUsesWith(*OrigAdd, Add); + case OCF_UNSIGNED_MUL: { + OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, &OrigI); + if (OR == OverflowResult::NeverOverflows) + return SetResult(Builder->CreateNUWMul(LHS, RHS), Builder->getFalse(), + true); + if (OR == OverflowResult::AlwaysOverflows) + return SetResult(Builder->CreateMul(LHS, RHS), Builder->getTrue(), true); + } // FALL THROUGH + case OCF_SIGNED_MUL: + // X * undef -> undef + if (isa<UndefValue>(RHS)) + return SetResult(RHS, UndefValue::get(Builder->getInt1Ty()), false); + + // X * 0 -> {0, false} + if (match(RHS, m_Zero())) + return SetResult(RHS, Builder->getFalse(), false); + + // X * 1 -> {X, false} + if (match(RHS, m_One())) + return SetResult(LHS, Builder->getFalse(), false); + + if (OCF == OCF_SIGNED_MUL) + if (WillNotOverflowSignedMul(LHS, RHS, OrigI)) + return SetResult(Builder->CreateNSWMul(LHS, RHS), Builder->getFalse(), + true); + } - // The original icmp gets replaced with the overflow value. - return ExtractValueInst::Create(Call, 1, "uadd.overflow"); + return false; } /// \brief Recognize and process idiom involving test for multiplication @@ -2311,7 +2366,7 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulB = Builder->CreateZExt(B, MulType); Value *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); - CallInst *Call = Builder->CreateCall2(F, MulA, MulB, "umul"); + CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul"); IC.Worklist.Add(MulInstr); // If there are uses of mul result other than the comparison, we know that @@ -2657,8 +2712,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { unsigned BitWidth = 0; if (Ty->isIntOrIntVectorTy()) BitWidth = Ty->getScalarSizeInBits(); - else if (DL) // Pointers require DL info to get their size. - BitWidth = DL->getTypeSizeInBits(Ty->getScalarType()); + else // Get pointer size. 
+ BitWidth = DL.getTypeSizeInBits(Ty->getScalarType()); bool isSignBit = false; @@ -2771,8 +2826,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Op0KnownZero, Op0KnownOne, 0)) return &I; if (SimplifyDemandedBits(I.getOperandUse(1), - APInt::getAllOnesValue(BitWidth), - Op1KnownZero, Op1KnownOne, 0)) + APInt::getAllOnesValue(BitWidth), Op1KnownZero, + Op1KnownOne, 0)) return &I; // Given the known and unknown bits, compute a range that the LHS could be @@ -3091,9 +3146,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } case Instruction::IntToPtr: // icmp pred inttoptr(X), null -> icmp pred X, 0 - if (RHSC->isNullValue() && DL && - DL->getIntPtrType(RHSC->getType()) == - LHSI->getOperand(0)->getType()) + if (RHSC->isNullValue() && + DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType()) return new ICmpInst(I.getPredicate(), LHSI->getOperand(0), Constant::getNullValue(LHSI->getOperand(0)->getType())); break; @@ -3425,7 +3479,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // if A is a power of 2. if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1, m_Zero()) && - isKnownToBeAPowerOfTwo(A, false, 0, AC, &I, DT) && I.isEquality()) + isKnownToBeAPowerOfTwo(A, DL, false, 0, AC, &I, DT) && I.isEquality()) return new ICmpInst(I.getInversePredicate(), Builder->CreateAnd(A, B), Op1); @@ -3439,21 +3493,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(I.getPredicate(), ConstantExpr::getNot(RHSC), A); } - // (a+b) <u a --> llvm.uadd.with.overflow. - // (a+b) <u b --> llvm.uadd.with.overflow. - if (I.getPredicate() == ICmpInst::ICMP_ULT && - match(Op0, m_Add(m_Value(A), m_Value(B))) && - (Op1 == A || Op1 == B)) - if (Instruction *R = ProcessUAddIdiom(I, Op0, *this)) - return R; - - // a >u (a+b) --> llvm.uadd.with.overflow. - // b >u (a+b) --> llvm.uadd.with.overflow. - if (I.getPredicate() == ICmpInst::ICMP_UGT && - match(Op1, m_Add(m_Value(A), m_Value(B))) && - (Op0 == A || Op0 == B)) - if (Instruction *R = ProcessUAddIdiom(I, Op1, *this)) - return R; + Instruction *AddI = nullptr; + if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B), + m_Instruction(AddI))) && + isa<IntegerType>(A->getType())) { + Value *Result; + Constant *Overflow; + if (OptimizeOverflowCheck(OCF_UNSIGNED_ADD, A, B, *AddI, Result, + Overflow)) { + ReplaceInstUsesWith(*AddI, Result); + return ReplaceInstUsesWith(I, Overflow); + } + } // (zext a) * (zext b) --> llvm.umul.with.overflow. 
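The m_UAddWithOverflow rewrite above recognizes, at the source level, the classic unsigned wraparound test. A small illustrative sketch (the function name is invented for the example, not from LLVM):

    #include <cstdint>

    // For unsigned integers, a + b wraps exactly when the sum is smaller than
    // an operand, so this comparison is a complete overflow test. InstCombine
    // canonicalizes it to a call to llvm.uadd.with.overflow.
    bool addWouldOverflow(uint32_t a, uint32_t b) {
      return a + b < a;
    }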
if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { @@ -3560,6 +3611,21 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } + // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0 + if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) && + match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) { + unsigned TypeBits = Cst1->getBitWidth(); + unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits); + if (ShAmt < TypeBits && ShAmt != 0) { + Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted"); + APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt); + Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal), + I.getName() + ".mask"); + return new ICmpInst(I.getPredicate(), And, + Constant::getNullValue(Cst1->getType())); + } + } + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to // "icmp (and X, mask), cst" uint64_t ShAmt = 0; @@ -3886,6 +3952,19 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { } } + // Test if the FCmpInst instruction is used exclusively by a select as + // part of a minimum or maximum operation. If so, refrain from doing + // any other folding. This helps out other analyses which understand + // non-obfuscated minimum and maximum idioms, such as ScalarEvolution + // and CodeGen. And in this case, at least one of the comparison + // operands has at least one user besides the compare (the select), + // which would often largely negate the benefit of folding anyway. + if (I.hasOneUse()) + if (SelectInst *SI = dyn_cast<SelectInst>(*I.user_begin())) + if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) || + (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1)) + return nullptr; + // Handle fcmp with constant RHS if (Constant *RHSC = dyn_cast<Constant>(Op1)) { if (Instruction *LHSI = dyn_cast<Instruction>(Op0)) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 3c3c135..97ea8df 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -1,4 +1,4 @@ -//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===// +//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,12 +6,17 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides internal interfaces used to implement the InstCombine. 
+/// +//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H -#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H +#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H +#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H -#include "InstCombineWorklist.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" @@ -21,7 +26,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" -#include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" #define DEBUG_TYPE "instcombine" @@ -34,20 +39,15 @@ class DbgDeclareInst; class MemIntrinsic; class MemSetInst; -/// SelectPatternFlavor - We can match a variety of different patterns for -/// select operations. -enum SelectPatternFlavor { - SPF_UNKNOWN = 0, - SPF_SMIN, - SPF_UMIN, - SPF_SMAX, - SPF_UMAX, - SPF_ABS, - SPF_NABS -}; - -/// getComplexity: Assign a complexity or rank value to LLVM Values... -/// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst +/// \brief Assign a complexity or rank value to LLVM Values. +/// +/// This routine maps IR values to various complexity ranks: +/// 0 -> undef +/// 1 -> Constants +/// 2 -> Other non-instructions +/// 3 -> Arguments +/// 3 -> Unary operations +/// 4 -> Other instructions static inline unsigned getComplexity(Value *V) { if (isa<Instruction>(V)) { if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) || @@ -60,18 +60,82 @@ static inline unsigned getComplexity(Value *V) { return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; } -/// AddOne - Add one to a Constant +/// \brief Add one to a Constant static inline Constant *AddOne(Constant *C) { return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); } -/// SubOne - Subtract one from a Constant +/// \brief Subtract one from a Constant static inline Constant *SubOne(Constant *C) { return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); } -/// InstCombineIRInserter - This is an IRBuilder insertion helper that works -/// just like the normal insertion helper, but also adds any new instructions -/// to the instcombine worklist. +/// \brief Return true if the specified value is free to invert (apply ~ to). +/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses +/// is true, work under the assumption that the caller intends to remove all +/// uses of V and only keep uses of ~V. +/// +static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { + // ~(~(X)) -> X. + if (BinaryOperator::isNot(V)) + return true; + + // Constants can be considered to be not'ed values. + if (isa<ConstantInt>(V)) + return true; + + // Compares can be inverted if all of their uses are being modified to use the + // ~V. + if (isa<CmpInst>(V)) + return WillInvertAllUses; + + // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1 + // - Constant) - A` if we are willing to invert all of the uses. + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) + if (isa<Constant>(BO->getOperand(0)) || isa<Constant>(BO->getOperand(1))) + return WillInvertAllUses; + + return false; +} + + +/// \brief Specific patterns of overflow check idioms that we match. 
+enum OverflowCheckFlavor {
+  OCF_UNSIGNED_ADD,
+  OCF_SIGNED_ADD,
+  OCF_UNSIGNED_SUB,
+  OCF_SIGNED_SUB,
+  OCF_UNSIGNED_MUL,
+  OCF_SIGNED_MUL,
+
+  OCF_INVALID
+};
+
+/// \brief Returns the OverflowCheckFlavor corresponding to an overflow_with_op
+/// intrinsic.
+static inline OverflowCheckFlavor
+IntrinsicIDToOverflowCheckFlavor(unsigned ID) {
+  switch (ID) {
+  default:
+    return OCF_INVALID;
+  case Intrinsic::uadd_with_overflow:
+    return OCF_UNSIGNED_ADD;
+  case Intrinsic::sadd_with_overflow:
+    return OCF_SIGNED_ADD;
+  case Intrinsic::usub_with_overflow:
+    return OCF_UNSIGNED_SUB;
+  case Intrinsic::ssub_with_overflow:
+    return OCF_SIGNED_SUB;
+  case Intrinsic::umul_with_overflow:
+    return OCF_UNSIGNED_MUL;
+  case Intrinsic::smul_with_overflow:
+    return OCF_SIGNED_MUL;
+  }
+}
+
+/// \brief An IRBuilder inserter that adds new instructions to the instcombine
+/// worklist.
 class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
     : public IRBuilderDefaultInserter<true> {
   InstCombineWorklist &Worklist;
@@ -92,47 +156,60 @@ public:
   }
 };
 
-/// InstCombiner - The -instcombine pass.
+/// \brief The core instruction combiner logic.
+///
+/// This class provides both the logic to recursively visit instructions and
+/// combine them, as well as the pass infrastructure for running this as part
+/// of the LLVM pass pipeline.
 class LLVM_LIBRARY_VISIBILITY InstCombiner
-    : public FunctionPass,
-      public InstVisitor<InstCombiner, Instruction *> {
-  AssumptionCache *AC;
-  const DataLayout *DL;
-  TargetLibraryInfo *TLI;
-  DominatorTree *DT;
-  bool MadeIRChange;
-  LibCallSimplifier *Simplifier;
-  bool MinimizeSize;
-
+    : public InstVisitor<InstCombiner, Instruction *> {
+  // FIXME: These members shouldn't be public.
 public:
-  /// Worklist - All of the instructions that need to be simplified.
-  InstCombineWorklist Worklist;
+  /// \brief A worklist of the instructions that need to be simplified.
+  InstCombineWorklist &Worklist;
 
-  /// Builder - This is an IRBuilder that automatically inserts new
-  /// instructions into the worklist when they are created.
+  /// \brief An IRBuilder that automatically inserts new instructions into the
+  /// worklist.
  typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
   BuilderTy *Builder;
 
-  static char ID; // Pass identification, replacement for typeid
-  InstCombiner()
-      : FunctionPass(ID), DL(nullptr), DT(nullptr), Builder(nullptr) {
-    MinimizeSize = false;
-    initializeInstCombinerPass(*PassRegistry::getPassRegistry());
-  }
+private:
+  // Mode in which we are running the combiner.
+  const bool MinimizeSize;
 
-public:
-  bool runOnFunction(Function &F) override;
+  // Required analyses.
+  // FIXME: These can never be null and should be references.
+  AssumptionCache *AC;
+  TargetLibraryInfo *TLI;
+  DominatorTree *DT;
+  const DataLayout &DL;
+
+  // Optional analyses. When non-null, these can both be used to do better
+  // combining and will be updated to reflect any changes.
+  LoopInfo *LI;
+
+  bool MadeIRChange;
 
-  bool DoOneIteration(Function &F, unsigned ItNum);
+public:
+  InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
+               bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI,
+               DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
+      : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
+        AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  /// \brief Run the combiner over the entire worklist until it is empty.
+  ///
+  /// \returns true if the IR is changed.
+  bool run();
 
   AssumptionCache *getAssumptionCache() const { return AC; }
 
-  const DataLayout *getDataLayout() const { return DL; }
-
+  const DataLayout &getDataLayout() const { return DL; }
+
   DominatorTree *getDominatorTree() const { return DT; }
+
+  LoopInfo *getLoopInfo() const { return LI; }
+
+  TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
 
   // Visitation implementation - Implement instruction combining for different
@@ -222,6 +299,7 @@ public:
   Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
                             Value *A, Value *B, Instruction &Outer,
                             SelectPatternFlavor SPF2, Value *C);
+  Instruction *FoldItoFPtoI(Instruction &FI);
   Instruction *visitSelectInst(SelectInst &SI);
   Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
   Instruction *visitCallInst(CallInst &CI);
@@ -262,37 +340,51 @@ private:
   bool ShouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
   Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
-  Type *FindElementAtOffset(Type *PtrTy, int64_t Offset,
+  Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
                             SmallVectorImpl<Value *> &NewIndices);
   Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
 
-  /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
-  /// results in any code being generated and is interesting to optimize out. If
-  /// the cast can be eliminated by some other simple transformation, we prefer
-  /// to do the simplification first.
+  /// \brief Classify whether a cast is worth optimizing.
+  ///
+  /// Returns true if the cast from "V to Ty" actually results in any code
+  /// being generated and is interesting to optimize out. If the cast can be
+  /// eliminated by some other simple transformation, we prefer to do the
+  /// simplification first.
   bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V,
                           Type *Ty);
 
+  /// \brief Try to optimize a sequence of instructions checking if an operation
+  /// on LHS and RHS overflows.
+  ///
+  /// If a simplification is possible, stores the simplified result of the
+  /// operation in OperationResult and the result of the overflow check in
+  /// OverflowResult, and returns true. If no simplification is possible,
+  /// returns false.
+  bool OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, Value *RHS,
+                             Instruction &CtxI, Value *&OperationResult,
+                             Constant *&OverflowResult);
+
   Instruction *visitCallSite(CallSite CS);
-  Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *DL);
+  Instruction *tryOptimizeCall(CallInst *CI);
   bool transformConstExprCastCall(CallSite CS);
   Instruction *transformCallThroughTrampoline(CallSite CS,
                                               IntrinsicInst *Tramp);
   Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
                                  bool DoXform = true);
   Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
-  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
-  bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
-  bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
-  bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
   Value *EmitGEPOffset(User *GEP);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
   Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
 
 public:
-  // InsertNewInstBefore - insert an instruction New before instruction Old
-  // in the program.  Add the new instruction to the worklist.
-  //
+  /// \brief Inserts an instruction \p New before instruction \p Old
+  ///
+  /// Also adds the new instruction to the worklist and returns \p New so that
+  /// it is suitable for use as the return from the visitation patterns.
   Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
     assert(New && !New->getParent() &&
            "New instruction already inserted into a basic block!");
@@ -302,21 +394,23 @@ public:
     return New;
   }
 
-  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the
-  // debug loc.
-  //
+  /// \brief Same as InsertNewInstBefore, but also sets the debug loc.
   Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
     New->setDebugLoc(Old.getDebugLoc());
     return InsertNewInstBefore(New, Old);
   }
 
-  // ReplaceInstUsesWith - This method is to be used when an instruction is
-  // found to be dead, replacable with another preexisting expression.  Here
-  // we add all uses of I to the worklist, replace all uses of I with the new
-  // value, then return I, so that the inst combiner will know that I was
-  // modified.
-  //
+  /// \brief A combiner-aware RAUW-like routine.
+  ///
+  /// This method is to be used when an instruction is found to be dead,
+  /// replaceable with another preexisting expression. Here we add all uses of
+  /// I to the worklist, replace all uses of I with the new value, then return
+  /// I, so that the inst combiner will know that I was modified.
   Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+    // If there are no uses to replace, then we return nullptr to indicate that
+    // no changes were made to the program.
+    if (I.use_empty()) return nullptr;
+
     Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
     // If we are replacing the instruction with itself, this must be in a
@@ -325,30 +419,27 @@ public:
       V = UndefValue::get(I.getType());
 
     DEBUG(dbgs() << "IC: Replacing " << I << "\n"
-          "    with " << *V << '\n');
+                 << "    with " << *V << '\n');
 
     I.replaceAllUsesWith(V);
     return &I;
   }
 
   /// Creates a result tuple for an overflow intrinsic \p II with a given
-  /// \p Result and a constant \p Overflow value. If \p ReUseName is true the
-  /// \p Result's name is taken from \p II.
+  /// \p Result and a constant \p Overflow value.
   Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
-                                   bool Overflow, bool ReUseName = true) {
-    if (ReUseName)
-      Result->takeName(II);
-    Constant *V[] = { UndefValue::get(Result->getType()),
-                      Overflow ? Builder->getTrue() : Builder->getFalse() };
+                                   Constant *Overflow) {
+    Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
     StructType *ST = cast<StructType>(II->getType());
     Constant *Struct = ConstantStruct::get(ST, V);
     return InsertValueInst::Create(Struct, Result, 0);
   }
-
-  // EraseInstFromFunction - When dealing with an instruction that has side
-  // effects or produces a void value, we can't rely on DCE to delete the
-  // instruction.  Instead, visit methods should return the value returned by
-  // this function.
+
+  /// \brief Combiner-aware instruction erasure.
+  ///
+  /// When dealing with an instruction that has side effects or produces a void
+  /// value, we can't rely on DCE to delete the instruction. Instead, visit
+  /// methods should return the value returned by this function.
   Instruction *EraseInstFromFunction(Instruction &I) {
     DEBUG(dbgs() << "IC: ERASE " << I << '\n');
 
@@ -367,13 +458,12 @@ public:
   }
 
   void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
-                        unsigned Depth = 0, Instruction *CxtI = nullptr) const {
+                        unsigned Depth, Instruction *CxtI) const {
     return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, AC, CxtI,
                                   DT);
   }
 
-  bool MaskedValueIsZero(Value *V, const APInt &Mask,
-                         unsigned Depth = 0,
+  bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
                          Instruction *CxtI = nullptr) const {
     return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AC, CxtI, DT);
   }
@@ -396,22 +486,24 @@ public:
   }
 
 private:
-  /// SimplifyAssociativeOrCommutative - This performs a few simplifications for
-  /// operators which are associative or commutative.
+  /// \brief Performs a few simplifications for operators which are associative
+  /// or commutative.
   bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
 
-  /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations
-  /// which some other binary operation distributes over either by factorizing
-  /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this
-  /// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is
-  /// a win).  Returns the simplified value, or null if it didn't simplify.
+  /// \brief Tries to simplify binary operations which some other binary
+  /// operation distributes over.
+  ///
+  /// It does this either by factorizing out common terms (eg "(A*B)+(A*C)"
+  /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
+  /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified
+  /// value, or null if it didn't simplify.
   Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
 
-  /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
-  /// based on the demanded bits.
+ /// \brief Attempts to replace V with a simpler value based on the demanded + /// bits. Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth, - Instruction *CxtI = nullptr); + Instruction *CxtI); bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0); /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded @@ -420,9 +512,8 @@ private: APInt DemandedMask, APInt &KnownZero, APInt &KnownOne); - /// SimplifyDemandedInstructionBits - Inst is an integer instruction that - /// SimplifyDemandedBits knows about. See if the instruction has any - /// properties that allow us to simplify its operands. + /// \brief Tries to simplify operands to an integer instruction based on its + /// demanded bits. bool SimplifyDemandedInstructionBits(Instruction &Inst); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, @@ -438,9 +529,8 @@ private: // Instruction *FoldOpIntoPhi(Instruction &I); - // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary" - // operator and they all are only used by the PHI, PHI together their - // inputs, and do the operation once, to the result of the PHI. + /// \brief Try to rotate an operation below a PHI node, using PHI nodes for + /// its operands. Instruction *FoldPHIArgOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); @@ -461,8 +551,9 @@ private: Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); - /// Descale - Return a value X such that Val = X * Scale, or null if none. If - /// the multiplication is known not to overflow then NoSignedWrap is set. + /// \brief Returns a value X such that Val = X * Scale, or null if none. + /// + /// If the multiplication is known not to overflow then NoSignedWrap is set. Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 6230c00..5aa59c6 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -11,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -83,7 +84,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, continue; } - if (CallSite CS = I) { + if (auto CS = CallSite(I)) { // If this is the function being called then we treat it like a load and // ignore it. if (CS.isCallee(&U)) @@ -163,62 +164,75 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI, return nullptr; } -Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { - // Ensure that the alloca array size argument has type intptr_t, so that - // any casting is exposed early. 
- if (DL) { - Type *IntPtrTy = DL->getIntPtrType(AI.getType()); - if (AI.getArraySize()->getType() != IntPtrTy) { - Value *V = Builder->CreateIntCast(AI.getArraySize(), - IntPtrTy, false); - AI.setOperand(0, V); - return &AI; - } +static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { + // Check for array size of 1 (scalar allocation). + if (!AI.isArrayAllocation()) { + // i32 1 is the canonical array size for scalar allocations. + if (AI.getArraySize()->getType()->isIntegerTy(32)) + return nullptr; + + // Canonicalize it. + Value *V = IC.Builder->getInt32(1); + AI.setOperand(0, V); + return &AI; } // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1 - if (AI.isArrayAllocation()) { // Check C != 1 - if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { - Type *NewTy = - ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - AllocaInst *New = Builder->CreateAlloca(NewTy, nullptr, AI.getName()); - New->setAlignment(AI.getAlignment()); - - // Scan to the end of the allocation instructions, to skip over a block of - // allocas if possible...also skip interleaved debug info - // - BasicBlock::iterator It = New; - while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It; - - // Now that I is pointing to the first non-allocation-inst in the block, - // insert our getelementptr instruction... - // - Type *IdxTy = DL - ? DL->getIntPtrType(AI.getType()) - : Type::getInt64Ty(AI.getContext()); - Value *NullIdx = Constant::getNullValue(IdxTy); - Value *Idx[2] = { NullIdx, NullIdx }; - Instruction *GEP = + if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { + Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); + AllocaInst *New = IC.Builder->CreateAlloca(NewTy, nullptr, AI.getName()); + New->setAlignment(AI.getAlignment()); + + // Scan to the end of the allocation instructions, to skip over a block of + // allocas if possible...also skip interleaved debug info + // + BasicBlock::iterator It = New; + while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) + ++It; + + // Now that I is pointing to the first non-allocation-inst in the block, + // insert our getelementptr instruction... + // + Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType()); + Value *NullIdx = Constant::getNullValue(IdxTy); + Value *Idx[2] = {NullIdx, NullIdx}; + Instruction *GEP = GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub"); - InsertNewInstBefore(GEP, *It); + IC.InsertNewInstBefore(GEP, *It); - // Now make everything use the getelementptr instead of the original - // allocation. - return ReplaceInstUsesWith(AI, GEP); - } else if (isa<UndefValue>(AI.getArraySize())) { - return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); - } + // Now make everything use the getelementptr instead of the original + // allocation. + return IC.ReplaceInstUsesWith(AI, GEP); + } + + if (isa<UndefValue>(AI.getArraySize())) + return IC.ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); + + // Ensure that the alloca array size argument has type intptr_t, so that + // any casting is exposed early. 
+ Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType()); + if (AI.getArraySize()->getType() != IntPtrTy) { + Value *V = IC.Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false); + AI.setOperand(0, V); + return &AI; } - if (DL && AI.getAllocatedType()->isSized()) { + return nullptr; +} + +Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { + if (auto *I = simplifyAllocaArraySize(*this, AI)) + return I; + + if (AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. if (AI.getAlignment() == 0) - AI.setAlignment(DL->getPrefTypeAlignment(AI.getAllocatedType())); + AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType())); // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should // allocate and return a unique pointer, even for a zero byte allocation. - if (DL->getTypeAllocSize(AI.getAllocatedType()) == 0) { + if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) { // For a zero sized alloca there is no point in doing an array allocation. // This is helpful if the array size is a complicated expression not used // elsewhere. @@ -236,7 +250,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // dominance as the array size was forced to a constant earlier already. AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst); if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || - DL->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { AI.moveBefore(FirstInst); return &AI; } @@ -245,7 +259,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // assign it the preferred alignment. if (EntryAI->getAlignment() == 0) EntryAI->setAlignment( - DL->getPrefTypeAlignment(EntryAI->getAllocatedType())); + DL.getPrefTypeAlignment(EntryAI->getAllocatedType())); // Replace this zero-sized alloca with the one at the start of the entry // block after ensuring that the address will be aligned enough for both // types. @@ -269,7 +283,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { SmallVector<Instruction *, 4> ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { unsigned SourceAlign = getOrEnforceKnownAlignment( - Copy->getSource(), AI.getAlignment(), DL, AC, &AI, DT); + Copy->getSource(), AI.getAlignment(), DL, &AI, AC, DT); if (AI.getAlignment() <= SourceAlign) { DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); @@ -300,7 +314,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { /// /// Note that this will create all of the instructions with whatever insert /// point the \c InstCombiner currently is using. 
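The loads rewritten by combineLoadToNewType (below) usually originate from type punning, where memory is read at a different type than the one it was written with. A hedged C++ sketch of the source-level pattern, for illustration only:

    #include <cstdint>
    #include <cstring>

    // Reading a float's bit pattern as a 32-bit integer. Compilers emit this
    // as a load plus a bitcast; the combine keeps the load but retypes it, so
    // the cast applies to the loaded value rather than to the pointer.
    uint32_t bitsOf(float f) {
      static_assert(sizeof(uint32_t) == sizeof(float), "expects 32-bit float");
      uint32_t u;
      std::memcpy(&u, &f, sizeof u);
      return u;
    }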
-static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy) { +static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy, + const Twine &Suffix = "") { Value *Ptr = LI.getPointerOperand(); unsigned AS = LI.getPointerAddressSpace(); SmallVector<std::pair<unsigned, MDNode *>, 8> MD; @@ -308,7 +323,8 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT LoadInst *NewLoad = IC.Builder->CreateAlignedLoad( IC.Builder->CreateBitCast(Ptr, NewTy->getPointerTo(AS)), - LI.getAlignment(), LI.getName()); + LI.getAlignment(), LI.getName() + Suffix); + MDBuilder MDB(NewLoad->getContext()); for (const auto &MDPair : MD) { unsigned ID = MDPair.first; MDNode *N = MDPair.second; @@ -335,21 +351,81 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT break; case LLVMContext::MD_nonnull: - // FIXME: We should translate this into range metadata for integer types - // and vice versa. - if (NewTy->isPointerTy()) + // This only directly applies if the new type is also a pointer. + if (NewTy->isPointerTy()) { NewLoad->setMetadata(ID, N); + break; + } + // If it's integral now, translate it to !range metadata. + if (NewTy->isIntegerTy()) { + auto *ITy = cast<IntegerType>(NewTy); + auto *NullInt = ConstantExpr::getPtrToInt( + ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); + auto *NonNullInt = + ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); + NewLoad->setMetadata(LLVMContext::MD_range, + MDB.createRange(NonNullInt, NullInt)); + } break; case LLVMContext::MD_range: // FIXME: It would be nice to propagate this in some way, but the type - // conversions make it hard. + // conversions make it hard. If the new type is a pointer, we could + // translate it to !nonnull metadata. break; } } return NewLoad; } +/// \brief Combine a store to a new type. +/// +/// Returns the newly created store instruction. +static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) { + Value *Ptr = SI.getPointerOperand(); + unsigned AS = SI.getPointerAddressSpace(); + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + SI.getAllMetadata(MD); + + StoreInst *NewStore = IC.Builder->CreateAlignedStore( + V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)), + SI.getAlignment()); + for (const auto &MDPair : MD) { + unsigned ID = MDPair.first; + MDNode *N = MDPair.second; + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a store instruction changing *only its + // type*. The only metadata it makes sense to drop is metadata which is + // invalidated when the pointer type changes. This should essentially + // never be the case in LLVM, but we explicitly switch over only known + // metadata to be conservatively correct. If you are adding metadata to + // LLVM which pertains to stores, you almost certainly want to add it + // here. + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: + // All of these directly apply. + NewStore->setMetadata(ID, N); + break; + + case LLVMContext::MD_invariant_load: + case LLVMContext::MD_nonnull: + case LLVMContext::MD_range: + // These don't apply for stores. 
+      break;
+    }
+  }
+
+  return NewStore;
+}
+
 /// \brief Combine loads to match the type of their uses' value after looking
 /// through intervening bitcasts.
 ///
@@ -376,6 +452,35 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
   if (LI.use_empty())
     return nullptr;
 
+  Type *Ty = LI.getType();
+  const DataLayout &DL = IC.getDataLayout();
+
+  // Try to canonicalize loads which are only ever stored to operate over
+  // integers instead of any other type. We only do this when the loaded type
+  // is sized and has a size exactly the same as its store size and the store
+  // size is a legal integer type.
+  if (!Ty->isIntegerTy() && Ty->isSized() &&
+      DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
+      DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty)) {
+    if (std::all_of(LI.user_begin(), LI.user_end(), [&LI](User *U) {
+          auto *SI = dyn_cast<StoreInst>(U);
+          return SI && SI->getPointerOperand() != &LI;
+        })) {
+      LoadInst *NewLoad = combineLoadToNewType(
+          IC, LI,
+          Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
+      // Replace all the stores with stores of the newly loaded value.
+      for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
+        auto *SI = cast<StoreInst>(*UI++);
+        IC.Builder->SetInsertPoint(SI);
+        combineStoreToNewValue(IC, *SI, NewLoad);
+        IC.EraseInstFromFunction(*SI);
+      }
+      assert(LI.use_empty() && "Failed to remove all users of the load!");
+      // Return the old load so the combiner can delete it safely.
+      return &LI;
+    }
+  }
 
   // Fold away bit casts of the loaded value by loading the desired type.
   if (LI.hasOneUse())
@@ -391,6 +496,218 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
   return nullptr;
 }
 
+static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
+  // FIXME: We could probably with some care handle both volatile and atomic
+  // loads here but it isn't clear that this is important.
+  if (!LI.isSimple())
+    return nullptr;
+
+  Type *T = LI.getType();
+  if (!T->isAggregateType())
+    return nullptr;
+
+  assert(LI.getAlignment() && "Alignment must be set at this point");
+
+  if (auto *ST = dyn_cast<StructType>(T)) {
+    // If the struct only has one element, we unpack.
+    if (ST->getNumElements() == 1) {
+      LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
+                                               ".unpack");
+      return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
+        UndefValue::get(T), NewLoad, 0, LI.getName()));
+    }
+  }
+
+  if (auto *AT = dyn_cast<ArrayType>(T)) {
+    // If the array only has one element, we unpack.
+    if (AT->getNumElements() == 1) {
+      LoadInst *NewLoad = combineLoadToNewType(IC, LI, AT->getElementType(),
+                                               ".unpack");
+      return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
+        UndefValue::get(T), NewLoad, 0, LI.getName()));
+    }
+  }
+
+  return nullptr;
+}
+
+// If we can determine that all possible objects pointed to by the provided
+// pointer value are, not only dereferenceable, but also definitively less than
+// or equal to the provided maximum size, then return true. Otherwise, return
+// false (constant global values and allocas fall into this category).
+//
+// FIXME: This should probably live in ValueTracking (or similar).
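The routine this comment introduces, isObjectSizeLessThanOrEq (below), multiplies an alloca's element count by the element size in 128-bit APInt arithmetic so the product itself cannot wrap. The same guard expressed with plain 64-bit integers looks like the following; an illustrative sketch, not code from the patch:

    #include <cstdint>

    // Returns true iff count * elemSize <= maxSize, detecting 64-bit overflow
    // of the product explicitly instead of widening to 128 bits.
    bool sizeFits(uint64_t count, uint64_t elemSize, uint64_t maxSize) {
      if (elemSize != 0 && count > UINT64_MAX / elemSize)
        return false;  // the product would wrap, so it certainly exceeds maxSize
      return count * elemSize <= maxSize;
    }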
+static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
+                                     const DataLayout &DL) {
+  SmallPtrSet<Value *, 4> Visited;
+  SmallVector<Value *, 4> Worklist(1, V);
+
+  do {
+    Value *P = Worklist.pop_back_val();
+    P = P->stripPointerCasts();
+
+    if (!Visited.insert(P).second)
+      continue;
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+      Worklist.push_back(SI->getTrueValue());
+      Worklist.push_back(SI->getFalseValue());
+      continue;
+    }
+
+    if (PHINode *PN = dyn_cast<PHINode>(P)) {
+      for (Value *IncValue : PN->incoming_values())
+        Worklist.push_back(IncValue);
+      continue;
+    }
+
+    if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
+      if (GA->mayBeOverridden())
+        return false;
+      Worklist.push_back(GA->getAliasee());
+      continue;
+    }
+
+    // If we know how big this object is, and it is less than MaxSize, continue
+    // searching. Otherwise, return false.
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
+      if (!AI->getAllocatedType()->isSized())
+        return false;
+
+      ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
+      if (!CS)
+        return false;
+
+      uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
+      // Make sure that, even if the multiplication below would wrap as a
+      // uint64_t, we still do the right thing.
+      if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
+        return false;
+      continue;
+    }
+
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+      if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
+        return false;
+
+      uint64_t InitSize = DL.getTypeAllocSize(GV->getType()->getElementType());
+      if (InitSize > MaxSize)
+        return false;
+      continue;
+    }
+
+    return false;
+  } while (!Worklist.empty());
+
+  return true;
+}
+
+// If we're indexing into an object of a known size, and the outer index is
+// not a constant, but having any value but zero would lead to undefined
+// behavior, replace it with zero.
+//
+// For example, if we have:
+// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
+// ...
+// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
+// ... = load i32* %arrayidx, align 4
+// Then we know that we can replace %x in the GEP with i64 0.
+//
+// FIXME: We could fold any GEP index to zero that would cause UB if it were
+// not zero. Currently, we only handle the first such index. Also, we could
+// search through non-zero constant indices if we kept track of the offsets
+// those indices implied.
+static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
+                                     Instruction *MemI, unsigned &Idx) {
+  if (GEPI->getNumOperands() < 2)
+    return false;
+
+  // Find the first non-zero index of a GEP. If all indices are zero, return
+  // one past the last index.
+  auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
+    unsigned I = 1;
+    for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
+      Value *V = GEPI->getOperand(I);
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+        if (CI->isZero())
+          continue;
+
+      break;
+    }
+
+    return I;
+  };
+
+  // Skip through initial 'zero' indices, and find the corresponding pointer
+  // type. See if the next index is not a constant.
+  Idx = FirstNZIdx(GEPI);
+  if (Idx == GEPI->getNumOperands())
+    return false;
+  if (isa<Constant>(GEPI->getOperand(Idx)))
+    return false;
+
+  SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
+  Type *AllocTy = GetElementPtrInst::getIndexedType(
+      cast<PointerType>(GEPI->getOperand(0)->getType()->getScalarType())
+          ->getElementType(),
+      Ops);
+  if (!AllocTy || !AllocTy->isSized())
+    return false;
+  const DataLayout &DL = IC.getDataLayout();
+  uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy);
+
+  // If there are more indices after the one we might replace with a zero, make
+  // sure they're all non-negative. If any of them are negative, the overall
+  // address being computed might be before the base address determined by the
+  // first non-zero index.
+  auto IsAllNonNegative = [&]() {
+    for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
+      bool KnownNonNegative, KnownNegative;
+      IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
+                        KnownNegative, 0, MemI);
+      if (KnownNonNegative)
+        continue;
+      return false;
+    }
+
+    return true;
+  };
+
+  // FIXME: If the GEP is not inbounds, and there are extra indices after the
+  // one we'll replace, those could cause the address computation to wrap
+  // (rendering the IsAllNonNegative() check below insufficient). We can do
+  // better, ignoring zero indices (and other indices we can prove small
+  // enough not to wrap).
+  if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
+    return false;
+
+  // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
+  // also known to be dereferenceable.
+  return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
+         IsAllNonNegative();
+}
+
+// If we're indexing into an object with a variable index for the memory
+// access, but the object has only one element, we can assume that the index
+// will always be zero. If we replace the GEP, return it.
+template <typename T>
+static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
+                                          T &MemI) {
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
+    unsigned Idx;
+    if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
+      Instruction *NewGEPI = GEPI->clone();
+      NewGEPI->setOperand(Idx,
+          ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
+      NewGEPI->insertBefore(GEPI);
+      MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
+      return NewGEPI;
+    }
+  }
+
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   Value *Op = LI.getOperand(0);
 
@@ -399,23 +716,30 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
     return Res;
 
   // Attempt to improve the alignment.
-  if (DL) {
-    unsigned KnownAlign = getOrEnforceKnownAlignment(
-        Op, DL->getPrefTypeAlignment(LI.getType()), DL, AC, &LI, DT);
-    unsigned LoadAlign = LI.getAlignment();
-    unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
-      DL->getABITypeAlignment(LI.getType());
-
-    if (KnownAlign > EffectiveLoadAlign)
-      LI.setAlignment(KnownAlign);
-    else if (LoadAlign == 0)
-      LI.setAlignment(EffectiveLoadAlign);
+  unsigned KnownAlign = getOrEnforceKnownAlignment(
+      Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, AC, DT);
+  unsigned LoadAlign = LI.getAlignment();
+  unsigned EffectiveLoadAlign =
+      LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());
+
+  if (KnownAlign > EffectiveLoadAlign)
+    LI.setAlignment(KnownAlign);
+  else if (LoadAlign == 0)
+    LI.setAlignment(EffectiveLoadAlign);
+
+  // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) { + Worklist.Add(NewGEPI); + return &LI; } // None of the following transforms are legal for volatile/atomic loads. // FIXME: Some of it is okay for atomic loads; needs refactoring. if (!LI.isSimple()) return nullptr; + if (Instruction *Res = unpackLoadToAggregate(*this, LI)) + return Res; + // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. @@ -466,8 +790,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { if (SelectInst *SI = dyn_cast<SelectInst>(Op)) { // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2). unsigned Align = LI.getAlignment(); - if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, DL) && - isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, DL)) { + if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align) && + isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align)) { LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1), SI->getOperand(1)->getName()+".val"); LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2), @@ -521,50 +845,12 @@ static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { if (!SI.isSimple()) return false; - Value *Ptr = SI.getPointerOperand(); Value *V = SI.getValueOperand(); - unsigned AS = SI.getPointerAddressSpace(); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - SI.getAllMetadata(MD); // Fold away bit casts of the stored value by storing the original type. if (auto *BC = dyn_cast<BitCastInst>(V)) { V = BC->getOperand(0); - StoreInst *NewStore = IC.Builder->CreateAlignedStore( - V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)), - SI.getAlignment()); - for (const auto &MDPair : MD) { - unsigned ID = MDPair.first; - MDNode *N = MDPair.second; - // Note, essentially every kind of metadata should be preserved here! This - // routine is supposed to clone a store instruction changing *only its - // type*. The only metadata it makes sense to drop is metadata which is - // invalidated when the pointer type changes. This should essentially - // never be the case in LLVM, but we explicitly switch over only known - // metadata to be conservatively correct. If you are adding metadata to - // LLVM which pertains to stores, you almost certainly want to add it - // here. - switch (ID) { - case LLVMContext::MD_dbg: - case LLVMContext::MD_tbaa: - case LLVMContext::MD_prof: - case LLVMContext::MD_fpmath: - case LLVMContext::MD_tbaa_struct: - case LLVMContext::MD_alias_scope: - case LLVMContext::MD_noalias: - case LLVMContext::MD_nontemporal: - case LLVMContext::MD_mem_parallel_loop_access: - // All of these directly apply. - NewStore->setMetadata(ID, N); - break; - - case LLVMContext::MD_invariant_load: - case LLVMContext::MD_nonnull: - case LLVMContext::MD_range: - // These don't apply for stores. - break; - } - } + combineStoreToNewValue(IC, SI, V); return true; } @@ -573,6 +859,39 @@ static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { return false; } +static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { + // FIXME: We could probably with some care handle both volatile and atomic + // stores here but it isn't clear that this is important. 
+ if (!SI.isSimple()) + return false; + + Value *V = SI.getValueOperand(); + Type *T = V->getType(); + + if (!T->isAggregateType()) + return false; + + if (auto *ST = dyn_cast<StructType>(T)) { + // If the struct has only one element, we unpack it. + if (ST->getNumElements() == 1) { + V = IC.Builder->CreateExtractValue(V, 0); + combineStoreToNewValue(IC, SI, V); + return true; + } + } + + if (auto *AT = dyn_cast<ArrayType>(T)) { + // If the array has only one element, we unpack it. + if (AT->getNumElements() == 1) { + V = IC.Builder->CreateExtractValue(V, 0); + combineStoreToNewValue(IC, SI, V); + return true; + } + } + + return false; +} + /// equivalentAddressValues - Test if A and B will obviously have the same /// value. This includes recognizing that %t0 and %t1 will have the same /// value in code like this: @@ -611,17 +930,25 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return EraseInstFromFunction(SI); // Attempt to improve the alignment. - if (DL) { - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL->getPrefTypeAlignment(Val->getType()), DL, AC, &SI, DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign : - DL->getABITypeAlignment(Val->getType()); - - if (KnownAlign > EffectiveStoreAlign) - SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) - SI.setAlignment(EffectiveStoreAlign); + unsigned KnownAlign = getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, AC, DT); + unsigned StoreAlign = SI.getAlignment(); + unsigned EffectiveStoreAlign = + StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + + if (KnownAlign > EffectiveStoreAlign) + SI.setAlignment(KnownAlign); + else if (StoreAlign == 0) + SI.setAlignment(EffectiveStoreAlign); + + // Try to canonicalize the stored type. + if (unpackStoreToAggregate(*this, SI)) + return EraseInstFromFunction(SI); + + // Replace GEP indices if possible. + if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) { + Worklist.Add(NewGEPI); + return &SI; } // Don't hack volatile/atomic stores. diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index b2ff96f..a554e9f 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" @@ -26,7 +26,7 @@ using namespace PatternMatch; /// where it is known to be non-zero. If this allows us to simplify the /// computation, do so and return the new operand, otherwise return null. static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, - Instruction *CxtI) { + Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine // if this is safe. For example, the use could be in dynamically unreached // code. @@ -47,8 +47,8 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, // inexact. Similarly for <<.
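As a hedged sketch of the shift case checked next (hypothetical IR):

  %d = shl i32 1, %n
  %q = udiv i32 %x, %d

Because %d is a divisor, it is known non-zero in this context, so the shl of a power of two can be flagged nuw (or exact, for a logical right shift), which in turn allows later folds such as rewriting the udiv as 'lshr i32 %x, %n'.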
if (BinaryOperator *I = dyn_cast<BinaryOperator>(V)) if (I->isLogicalShift() && - isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, - IC.getAssumptionCache(), CxtI, + isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0, + IC.getAssumptionCache(), &CxtI, IC.getDominatorTree())) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. @@ -126,7 +126,7 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) { /// \brief Return true if we can prove that: /// (mul LHS, RHS) === (mul nsw LHS, RHS) bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, - Instruction *CxtI) { + Instruction &CxtI) { // Multiplying n * m significant bits yields a result of n + m significant // bits. If the total number of significant bits does not exceed the // result bit width (minus 1), there is no overflow. @@ -137,8 +137,8 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, // Note that underestimating the number of sign bits gives a more // conservative answer. - unsigned SignBits = ComputeNumSignBits(LHS, 0, CxtI) + - ComputeNumSignBits(RHS, 0, CxtI); + unsigned SignBits = + ComputeNumSignBits(LHS, 0, &CxtI) + ComputeNumSignBits(RHS, 0, &CxtI); // First handle the easy case: if we have enough sign bits there's // definitely no overflow. @@ -157,8 +157,8 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, // For simplicity we just check if at least one side is not negative. bool LHSNonNegative, LHSNegative; bool RHSNonNegative, RHSNegative; - ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, CxtI); - ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, CxtI); + ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI); + ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI); if (LHSNonNegative || RHSNonNegative) return true; } @@ -217,12 +217,16 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { NewCst = getLogBase2Vector(CV); if (NewCst) { + unsigned Width = NewCst->getType()->getPrimitiveSizeInBits(); BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst); if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); - if (I.hasNoSignedWrap() && NewCst->isNotMinSignedValue()) - Shl->setHasNoSignedWrap(); + if (I.hasNoSignedWrap()) { + uint64_t V; + if (match(NewCst, m_ConstantInt(V)) && V != Width - 1) + Shl->setHasNoSignedWrap(); + } return Shl; } @@ -375,7 +379,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } } - if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, &I)) { + if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, I)) { Changed = true; I.setHasNoSignedWrap(true); } @@ -422,7 +426,7 @@ static bool isFiniteNonZeroFp(Constant *C) { if (C->getType()->isVectorTy()) { for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I) { - ConstantFP *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(I)); + ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I)); if (!CFP || !CFP->getValueAPF().isFiniteNonZero()) return false; } @@ -437,7 +441,7 @@ static bool isNormalFp(Constant *C) { if (C->getType()->isVectorTy()) { for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I) { - ConstantFP *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(I)); + ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I)); if (!CFP || !CFP->getValueAPF().isNormal()) return false; } @@ -780,7 +784,7 @@ Instruction 
*InstCombiner::commonIDivTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // The RHS is known non-zero. - if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, &I)) { + if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) { I.setOperand(1, V); return &I; } @@ -1155,7 +1159,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return BO; } - if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) { + if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, AC, &I, DT)) { // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) // Safe because the only negative value (1 << Y) can take on is // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have @@ -1206,7 +1210,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyFDivInst(Op0, Op1, DL, TLI, DT, AC)) + if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(), + DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); if (isa<Constant>(Op0)) @@ -1337,7 +1342,7 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // The RHS is known non-zero. - if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, &I)) { + if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) { I.setOperand(1, V); return &I; } @@ -1384,7 +1389,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { I.getType()); // X urem Y -> X and Y-1, where Y is a power of 2, - if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) { + if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, AC, &I, DT)) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); @@ -1481,7 +1486,8 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) { if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); - if (Value *V = SimplifyFRemInst(Op0, Op1, DL, TLI, DT, AC)) + if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(), + DL, TLI, DT, AC)) return ReplaceInstUsesWith(I, V); // Handle cases involving: rem X, (select Cond, Y, Z) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 53831c8..6a6693c 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -11,11 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/IR/DataLayout.h" using namespace llvm; #define DEBUG_TYPE "instcombine" @@ -231,7 +230,8 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { Value *Base = FixedOperands[0]; GetElementPtrInst *NewGEP = - GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1)); + GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base, + makeArrayRef(FixedOperands).slice(1)); if (AllInBounds) NewGEP->setIsInBounds(); NewGEP->setDebugLoc(FirstInst->getDebugLoc()); return NewGEP; @@ -375,8 +375,8 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // and mark all the input loads as non-volatile. 
If we don't do this, we will // insert a new volatile load and the old ones will not be deletable. if (isVolatile) - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) - cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false); + for (Value *IncValue : PN.incoming_values()) + cast<LoadInst>(IncValue)->setVolatile(false); LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); NewLI->setDebugLoc(FirstLI->getDebugLoc()); @@ -539,8 +539,7 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, // Scan the operands to see if they are either phi nodes or are equal to // the value. - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Op = PN->getIncomingValue(i); + for (Value *Op : PN->incoming_values()) { if (PHINode *OpPN = dyn_cast<PHINode>(Op)) { if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs)) return false; @@ -891,8 +890,8 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { // it is only used by trunc or trunc(lshr) operations. If so, we split the // PHI into the various pieces being extracted. This sort of thing is // introduced when SROA promotes an aggregate to a single large integer type. - if (PN.getType()->isIntegerTy() && DL && - !DL->isLegalInteger(PN.getType()->getPrimitiveSizeInBits())) + if (PN.getType()->isIntegerTy() && + !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits())) if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) return Res; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index bf3c33e..d2fbcdd 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -11,88 +11,55 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// MatchSelectPattern - Pattern match integer [SU]MIN, [SU]MAX, and ABS idioms, -/// returning the kind and providing the out parameter results if we -/// successfully match. static SelectPatternFlavor -MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { - SelectInst *SI = dyn_cast<SelectInst>(V); - if (!SI) return SPF_UNKNOWN; - - ICmpInst *ICI = dyn_cast<ICmpInst>(SI->getCondition()); - if (!ICI) return SPF_UNKNOWN; - - ICmpInst::Predicate Pred = ICI->getPredicate(); - Value *CmpLHS = ICI->getOperand(0); - Value *CmpRHS = ICI->getOperand(1); - Value *TrueVal = SI->getTrueValue(); - Value *FalseVal = SI->getFalseValue(); - - LHS = CmpLHS; - RHS = CmpRHS; - - // (icmp X, Y) ? X : Y - if (TrueVal == CmpLHS && FalseVal == CmpRHS) { - switch (Pred) { - default: return SPF_UNKNOWN; // Equality. - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMAX; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMAX; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMIN; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMIN; - } - } - - // (icmp X, Y) ? Y : X - if (TrueVal == CmpRHS && FalseVal == CmpLHS) { - switch (Pred) { - default: return SPF_UNKNOWN; // Equality. 
- case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMIN; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMIN; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMAX; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMAX; - } +getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) { + switch (SPF) { + default: + llvm_unreachable("unhandled!"); + + case SPF_SMIN: + return SPF_SMAX; + case SPF_UMIN: + return SPF_UMAX; + case SPF_SMAX: + return SPF_SMIN; + case SPF_UMAX: + return SPF_UMIN; } +} - if (ConstantInt *C1 = dyn_cast<ConstantInt>(CmpRHS)) { - if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) || - (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) { - - // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X - // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X - if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) { - return (CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS; - } - - // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X - // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X - if (Pred == ICmpInst::ICMP_SLT && (C1->isZero() || C1->isOne())) { - return (CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS; - } - } +static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { + switch (SPF) { + default: + llvm_unreachable("unhandled!"); + + case SPF_SMIN: + return ICmpInst::ICMP_SLT; + case SPF_UMIN: + return ICmpInst::ICMP_ULT; + case SPF_SMAX: + return ICmpInst::ICMP_SGT; + case SPF_UMAX: + return ICmpInst::ICMP_UGT; } - - // TODO: (X > 4) ? X : 5 --> (X >= 5) ? X : 5 --> MAX(X, 5) - - return SPF_UNKNOWN; } +static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder, + SelectPatternFlavor SPF, Value *A, + Value *B) { + CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); + return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B); +} /// GetSelectFoldableOperands - We want to turn code that looks like this: /// %C = or %A, %B @@ -312,9 +279,9 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, /// SimplifyWithOpReplaced - See if V simplifies when its operand Op is /// replaced with RepOp. static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const DataLayout *TD, const TargetLibraryInfo *TLI, - DominatorTree *DT, AssumptionCache *AC) { + const DataLayout &DL, DominatorTree *DT, + AssumptionCache *AC) { // Trivial replacement. if (V == Op) return RepOp; @@ -326,18 +293,18 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // If this is a binary operator, try to simplify it with the replaced op. if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) { if (B->getOperand(0) == Op) - return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD, TLI); + return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), DL, TLI); if (B->getOperand(1) == Op) - return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD, TLI); + return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, DL, TLI); } // Same for CmpInsts. 
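One fold this helper ultimately enables, as a sketch (hypothetical IR):

  %c = icmp eq i32 %x, 0
  %o = or i32 %x, %y
  %s = select i1 %c, i32 %y, i32 %o

Substituting 0 for %x in the false arm simplifies 'or i32 0, %y' to %y, which matches the true arm, so the whole select can be replaced by %o.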
if (CmpInst *C = dyn_cast<CmpInst>(I)) { if (C->getOperand(0) == Op) - return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD, + return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), DL, TLI, DT, AC); if (C->getOperand(1) == Op) - return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD, + return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, DL, TLI, DT, AC); } @@ -361,14 +328,14 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (ConstOps.size() == I->getNumOperands()) { if (CmpInst *C = dyn_cast<CmpInst>(I)) return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], - ConstOps[1], TD, TLI); + ConstOps[1], DL, TLI); if (LoadInst *LI = dyn_cast<LoadInst>(I)) if (!LI->isVolatile()) - return ConstantFoldLoadFromConstPtr(ConstOps[0], TD); + return ConstantFoldLoadFromConstPtr(ConstOps[0], DL); - return ConstantFoldInstOperands(I->getOpcode(), I->getType(), - ConstOps, TD, TLI); + return ConstantFoldInstOperands(I->getOpcode(), I->getType(), ConstOps, + DL, TLI); } } @@ -437,6 +404,62 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, return Builder->CreateOr(V, Y); } +/// Attempt to fold a cttz/ctlz followed by an icmp plus select into a single +/// call to cttz/ctlz with flag 'is_zero_undef' cleared. +/// +/// For example, we can fold the following code sequence: +/// \code +/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) +/// %1 = icmp ne i32 %x, 0 +/// %2 = select i1 %1, i32 %0, i32 32 +/// \endcode +/// +/// into: +/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false) +static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, + InstCombiner::BuilderTy *Builder) { + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + + // Check if the condition value compares a value for equality against zero. + if (!ICI->isEquality() || !match(CmpRHS, m_Zero())) + return nullptr; + + Value *Count = FalseVal; + Value *ValueOnZero = TrueVal; + if (Pred == ICmpInst::ICMP_NE) + std::swap(Count, ValueOnZero); + + // Skip zero extend/truncate. + Value *V = nullptr; + if (match(Count, m_ZExt(m_Value(V))) || + match(Count, m_Trunc(m_Value(V)))) + Count = V; + + // Check if the value propagated on zero is a constant number equal to the + // size in bits of 'Count'. + unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits(); + if (!match(ValueOnZero, m_SpecificInt(SizeOfInBits))) + return nullptr; + + // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the + // input to the cttz/ctlz is used as LHS for the compare instruction. + if (match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) || + match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) { + IntrinsicInst *II = cast<IntrinsicInst>(Count); + IRBuilder<> Builder(II); + // Explicitly clear the 'is_zero_undef' flag. + IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone()); + Type *Ty = NewI->getArgOperand(1)->getType(); + NewI->setArgOperand(1, Constant::getNullValue(Ty)); + Builder.Insert(NewI); + return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType()); + } + + return nullptr; +} + /// visitSelectInstWithICmp - Visit a SelectInst that has an /// ICmpInst as its first operand. /// @@ -579,25 +602,25 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // arms of the select.
See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == TrueVal) return ReplaceInstUsesWith(SI, FalseVal); - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == FalseVal) return ReplaceInstUsesWith(SI, FalseVal); } else if (Pred == ICmpInst::ICMP_NE) { - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == FalseVal) return ReplaceInstUsesWith(SI, TrueVal); - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) == + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) == + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) == TrueVal) return ReplaceInstUsesWith(SI, TrueVal); } @@ -665,6 +688,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder)) return ReplaceInstUsesWith(SI, V); + if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder)) + return ReplaceInstUsesWith(SI, V); + return Changed ? &SI : nullptr; } @@ -770,6 +796,52 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, SI->getCondition(), SI->getFalseValue(), SI->getTrueValue()); return ReplaceInstUsesWith(Outer, NewSI); } + + auto IsFreeOrProfitableToInvert = + [&](Value *V, Value *&NotV, bool &ElidesXor) { + if (match(V, m_Not(m_Value(NotV)))) { + // If V has at most 2 uses then we can get rid of the xor operation + // entirely. + ElidesXor |= !V->hasNUsesOrMore(3); + return true; + } + + if (IsFreeToInvert(V, !V->hasNUsesOrMore(3))) { + NotV = nullptr; + return true; + } + + return false; + }; + + Value *NotA, *NotB, *NotC; + bool ElidesXor = false; + + // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C) + // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C) + // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C) + // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C) + // + // This transform is performance neutral if we can elide at least one xor from + // the set of three operands, since we'll be tacking on an xor at the very + // end. 
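A hypothetical before/after sketch of the rewrite, where %na, %nb, and %nz are single-use inversions (xor with -1) of %a, %b, and %z:

  ; before: smin(smin(~a, ~b), ~z)
  %c1 = icmp slt i32 %na, %nb
  %m1 = select i1 %c1, i32 %na, i32 %nb
  %c2 = icmp slt i32 %m1, %nz
  %m2 = select i1 %c2, i32 %m1, i32 %nz

  ; after: ~smax(smax(a, b), z)
  %d1 = icmp sgt i32 %a, %b
  %n1 = select i1 %d1, i32 %a, i32 %b
  %d2 = icmp sgt i32 %n1, %z
  %n2 = select i1 %d2, i32 %n1, i32 %z
  %r  = xor i32 %n2, -1

Three xors feeding the min/max tree are traded for one trailing xor, so the rewrite adds no net instructions whenever the elision test above succeeds.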
+ if (IsFreeOrProfitableToInvert(A, NotA, ElidesXor) && + IsFreeOrProfitableToInvert(B, NotB, ElidesXor) && + IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) { + if (!NotA) + NotA = Builder->CreateNot(A); + if (!NotB) + NotB = Builder->CreateNot(B); + if (!NotC) + NotC = Builder->CreateNot(C); + + Value *NewInner = generateMinMaxSelectPattern( + Builder, getInverseMinMaxSelectPattern(SPF1), NotA, NotB); + Value *NewOuter = Builder->CreateNot(generateMinMaxSelectPattern( + Builder, getInverseMinMaxSelectPattern(SPF2), NewInner, NotC)); + return ReplaceInstUsesWith(Outer, NewOuter); + } + return nullptr; } @@ -868,7 +940,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return BinaryOperator::CreateAnd(NotCond, FalseVal); } if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) { - if (C->getZExtValue() == false) { + if (!C->getZExtValue()) { // Change: A = select B, C, false --> A = and B, C return BinaryOperator::CreateAnd(CondVal, TrueVal); } @@ -1082,26 +1154,67 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } // See if we can fold the select into one of our operands. - if (SI.getType()->isIntegerTy()) { + if (SI.getType()->isIntOrIntVectorTy()) { if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; - // MAX(MAX(a, b), a) -> MAX(a, b) - // MIN(MIN(a, b), a) -> MIN(a, b) - // MAX(MIN(a, b), a) -> a - // MIN(MAX(a, b), a) -> a Value *LHS, *RHS, *LHS2, *RHS2; - if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) { - if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2)) + Instruction::CastOps CastOp; + SelectPatternFlavor SPF = matchSelectPattern(&SI, LHS, RHS, &CastOp); + + if (SPF) { + // Canonicalize so that type casts are outside select patterns. + if (LHS->getType()->getPrimitiveSizeInBits() != + SI.getType()->getPrimitiveSizeInBits()) { + CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); + Value *Cmp = Builder->CreateICmp(Pred, LHS, RHS); + Value *NewSI = Builder->CreateCast(CastOp, + Builder->CreateSelect(Cmp, LHS, RHS), + SI.getType()); + return ReplaceInstUsesWith(SI, NewSI); + } + + // MAX(MAX(a, b), a) -> MAX(a, b) + // MIN(MIN(a, b), a) -> MIN(a, b) + // MAX(MIN(a, b), a) -> a + // MIN(MAX(a, b), a) -> a + if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2)) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) return R; - if (SelectPatternFlavor SPF2 = MatchSelectPattern(RHS, LHS2, RHS2)) + if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2)) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2, SI, SPF, LHS)) return R; } + // MAX(~a, ~b) -> ~MIN(a, b) + if (SPF == SPF_SMAX || SPF == SPF_UMAX) { + if (IsFreeToInvert(LHS, LHS->hasNUses(2)) && + IsFreeToInvert(RHS, RHS->hasNUses(2))) { + + // This transform adds a xor operation and that extra cost needs to be + // justified. We look for simplifications that will result from + // applying this rule: + + bool Profitable = + (LHS->hasNUses(2) && match(LHS, m_Not(m_Value()))) || + (RHS->hasNUses(2) && match(RHS, m_Not(m_Value()))) || + (SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value()))); + + if (Profitable) { + Value *NewLHS = Builder->CreateNot(LHS); + Value *NewRHS = Builder->CreateNot(RHS); + Value *NewCmp = SPF == SPF_SMAX + ? 
Builder->CreateICmpSLT(NewLHS, NewRHS) + : Builder->CreateICmpULT(NewLHS, NewRHS); + Value *NewSI = + Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS)); + return ReplaceInstUsesWith(SI, NewSI); + } + } + } + // TODO. // ABS(-X) -> ABS(X) } @@ -1115,19 +1228,41 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return NV; if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) { - if (TrueSI->getCondition() == CondVal) { - if (SI.getTrueValue() == TrueSI->getTrueValue()) - return nullptr; - SI.setOperand(1, TrueSI->getTrueValue()); - return &SI; + if (TrueSI->getCondition()->getType() == CondVal->getType()) { + // select(C, select(C, a, b), c) -> select(C, a, c) + if (TrueSI->getCondition() == CondVal) { + if (SI.getTrueValue() == TrueSI->getTrueValue()) + return nullptr; + SI.setOperand(1, TrueSI->getTrueValue()); + return &SI; + } + // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b) + // We choose this as normal form to enable folding on the And and shortening + // paths for the values (this helps GetUnderlyingObjects() for example). + if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) { + Value *And = Builder->CreateAnd(CondVal, TrueSI->getCondition()); + SI.setOperand(0, And); + SI.setOperand(1, TrueSI->getTrueValue()); + return &SI; + } } } if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) { - if (FalseSI->getCondition() == CondVal) { - if (SI.getFalseValue() == FalseSI->getFalseValue()) - return nullptr; - SI.setOperand(2, FalseSI->getFalseValue()); - return &SI; + if (FalseSI->getCondition()->getType() == CondVal->getType()) { + // select(C, a, select(C, b, c)) -> select(C, a, c) + if (FalseSI->getCondition() == CondVal) { + if (SI.getFalseValue() == FalseSI->getFalseValue()) + return nullptr; + SI.setOperand(2, FalseSI->getFalseValue()); + return &SI; + } + // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b) + if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) { + Value *Or = Builder->CreateOr(CondVal, FalseSI->getCondition()); + SI.setOperand(0, Or); + SI.setOperand(2, FalseSI->getFalseValue()); + return &SI; + } } } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 0a16e25..d04ed58 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" @@ -175,8 +175,8 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateShifted(PN->getIncomingValue(i), NumBits, isLeftShift, + for (Value *IncValue : PN->incoming_values()) + if (!CanEvaluateShifted(IncValue, NumBits, isLeftShift, IC, PN)) return false; return true; @@ -187,7 +187,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, /// GetShiftedValue - When CanEvaluateShifted returned true for an expression, /// this value inserts the new computation that produces the shifted value. 
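For context, a sketch of the kind of rewrite this pair of routines performs (hypothetical IR, assuming the inner values are single-use):

  %t = lshr i32 %x, 4
  %o = or i32 %t, 1
  %r = shl i32 %o, 4

CanEvaluateShifted reports that %o can be computed in already-shifted form, and GetShiftedValue then rebuilds it so the outer shl disappears, roughly:

  %t2 = and i32 %x, -16
  %r  = or i32 %t2, 16

The cancelling lshr/shl pair collapses to a mask, and the constant operand is pre-shifted.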
static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, - InstCombiner &IC) { + InstCombiner &IC, const DataLayout &DL) { // We can always evaluate constants shifted. if (Constant *C = dyn_cast<Constant>(V)) { if (isLeftShift) @@ -196,8 +196,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, V = IC.Builder->CreateLShr(C, NumBits); // If we got a constantexpr back, try to simplify it with DL info. if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - V = ConstantFoldConstantExpression(CE, IC.getDataLayout(), - IC.getTargetLibraryInfo()); + V = ConstantFoldConstantExpression(CE, DL, IC.getTargetLibraryInfo()); return V; } @@ -210,8 +209,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, case Instruction::Or: case Instruction::Xor: // Bitwise operators can all be evaluated in shifted form. - I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC)); - I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC)); + I->setOperand( + 0, GetShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL)); + I->setOperand( + 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL)); return I; case Instruction::Shl: { @@ -297,8 +298,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, } case Instruction::Select: - I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC)); - I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC)); + I->setOperand( + 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL)); + I->setOperand( + 2, GetShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL)); return I; case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider // instructions with a single use.
PHINode *PN = cast<PHINode>(I); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i), - NumBits, isLeftShift, IC)); + PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i), NumBits, + isLeftShift, IC, DL)); return PN; } } @@ -337,8 +340,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression" " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n"); - return ReplaceInstUsesWith(I, - GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this)); + return ReplaceInstUsesWith( + I, GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this, DL)); } // See if we can simplify any instructions used by the instruction whose sole diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index ad6983a..80628b2 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" -#include "llvm/IR/DataLayout.h" +#include "InstCombineInternal.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" @@ -44,19 +44,6 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, Demanded &= OpC->getValue(); I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded)); - // If either 'nsw' or 'nuw' is set and the constant is negative, - // removing *any* bits from the constant could make overflow occur. - // Remove 'nsw' and 'nuw' from the instruction in this case. 
- if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(I)) { - assert(OBO->getOpcode() == Instruction::Add); - if (OBO->hasNoSignedWrap() || OBO->hasNoUnsignedWrap()) { - if (OpC->getValue().isNegative()) { - cast<BinaryOperator>(OBO)->setHasNoSignedWrap(false); - cast<BinaryOperator>(OBO)->setHasNoUnsignedWrap(false); - } - } - } - return true; } @@ -70,8 +57,8 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); - Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, - KnownZero, KnownOne, 0, &Inst); + Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne, + 0, &Inst); if (!V) return false; if (V == &Inst) return true; ReplaceInstUsesWith(Inst, V); @@ -84,9 +71,9 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth) { - Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, - KnownZero, KnownOne, Depth, - dyn_cast<Instruction>(U.getUser())); + auto *UserI = dyn_cast<Instruction>(U.getUser()); + Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero, + KnownOne, Depth, UserI); if (!NewVal) return false; U = NewVal; return true; @@ -122,15 +109,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, assert(Depth <= 6 && "Limit Search Depth"); uint32_t BitWidth = DemandedMask.getBitWidth(); Type *VTy = V->getType(); - assert((DL || !VTy->isPointerTy()) && - "SimplifyDemandedBits needs to know bit widths!"); - assert((!DL || DL->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) && - (!VTy->isIntOrIntVectorTy() || - VTy->getScalarSizeInBits() == BitWidth) && - KnownZero.getBitWidth() == BitWidth && - KnownOne.getBitWidth() == BitWidth && - "Value *V, DemandedMask, KnownZero and KnownOne " - "must have same BitWidth"); + assert( + (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) && + KnownZero.getBitWidth() == BitWidth && + KnownOne.getBitWidth() == BitWidth && + "Value *V, DemandedMask, KnownZero and KnownOne " + "must have same BitWidth"); if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { // We know all of the bits for a constant! KnownOne = CI->getValue() & DemandedMask; @@ -174,9 +158,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // this instruction has a simpler value in that context. if (I->getOpcode() == Instruction::And) { // If either the LHS or the RHS are Zero, the result is zero. - computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1, CxtI); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, CxtI); // If all of the demanded bits are known 1 on one side, return the other. @@ -198,9 +182,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. 
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1, CxtI); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, CxtI); // If all of the demanded bits are known zero on one side, return the @@ -225,9 +209,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // We can simplify (X^Y) -> X or Y in the user's context if we know that // only bits from X or Y are demanded. - computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1, CxtI); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, CxtI); // If all of the demanded bits are known zero on one side, return the @@ -256,10 +240,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; case Instruction::And: // If either the LHS or the RHS are Zero, the result is zero. - if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1) || + if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero, + RHSKnownOne, Depth + 1) || SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero, - LHSKnownZero, LHSKnownOne, Depth+1)) + LHSKnownZero, LHSKnownOne, Depth + 1)) return I; assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); @@ -294,10 +278,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; case Instruction::Or: // If either the LHS or the RHS are One, the result is One. 
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1) || + if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero, + RHSKnownOne, Depth + 1) || SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne, - LHSKnownZero, LHSKnownOne, Depth+1)) + LHSKnownZero, LHSKnownOne, Depth + 1)) return I; assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); @@ -336,10 +320,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownOne = RHSKnownOne | LHSKnownOne; break; case Instruction::Xor: { - if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, - LHSKnownZero, LHSKnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero, + RHSKnownOne, Depth + 1) || + SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, LHSKnownZero, + LHSKnownOne, Depth + 1)) return I; assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); @@ -423,10 +407,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, break; } case Instruction::Select: - if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, - LHSKnownZero, LHSKnownOne, Depth+1)) + // If this is a select as part of a min/max pattern, don't simplify any + // further in case we break the structure. + Value *LHS, *RHS; + if (matchSelectPattern(I, LHS, RHS) != SPF_UNKNOWN) + return nullptr; + + if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero, + RHSKnownOne, Depth + 1) || + SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero, + LHSKnownOne, Depth + 1)) return I; assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); @@ -445,8 +435,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, DemandedMask = DemandedMask.zext(truncBf); KnownZero = KnownZero.zext(truncBf); KnownOne = KnownOne.zext(truncBf); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, + KnownOne, Depth + 1)) return I; DemandedMask = DemandedMask.trunc(BitWidth); KnownZero = KnownZero.trunc(BitWidth); @@ -471,8 +461,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Don't touch a vector-to-scalar bitcast. 
return nullptr; - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, + KnownOne, Depth + 1)) return I; assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); break; @@ -483,8 +473,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, DemandedMask = DemandedMask.trunc(SrcBitWidth); KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, + KnownOne, Depth + 1)) return I; DemandedMask = DemandedMask.zext(BitWidth); KnownZero = KnownZero.zext(BitWidth); @@ -510,8 +500,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, KnownZero, + KnownOne, Depth + 1)) return I; InputDemandedBits = InputDemandedBits.zext(BitWidth); KnownZero = KnownZero.zext(BitWidth); @@ -532,113 +522,35 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } break; } - case Instruction::Add: { - // Figure out what the input bits are. If the top bits of the and result - // are not demanded, then the add doesn't demand them from its input - // either. + case Instruction::Add: + case Instruction::Sub: { + /// If the high-bits of an ADD/SUB are not demanded, then we do not care + /// about the high bits of the operands. unsigned NLZ = DemandedMask.countLeadingZeros(); - - // If there is a constant on the RHS, there are a variety of xformations - // we can do. - if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) { - // If null, this should be simplified elsewhere. Some of the xforms here - // won't work if the RHS is zero. - if (RHS->isZero()) - break; - - // If the top bit of the output is demanded, demand everything from the - // input. Otherwise, we demand all the input bits except NLZ top bits. - APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ)); - - // Find information about known zero/one bits in the input. - if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits, - LHSKnownZero, LHSKnownOne, Depth+1)) - return I; - - // If the RHS of the add has bits set that can't affect the input, reduce - // the constant. - if (ShrinkDemandedConstant(I, 1, InDemandedBits)) - return I; - - // Avoid excess work. - if (LHSKnownZero == 0 && LHSKnownOne == 0) - break; - - // Turn it into OR if input bits are zero. - if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) { - Instruction *Or = - BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), - I->getName()); - return InsertNewInstWith(Or, *I); - } - - // We can say something about the output known-zero and known-one bits, - // depending on potential carries from the input constant and the - // unknowns. For example if the LHS is known to have at most the 0x0F0F0 - // bits set and the RHS constant is 0x01001, then we know we have a known - // one mask of 0x00001 and a known zero mask of 0xE0F0E. - - // To compute this, we first compute the potential carry bits. These are - // the bits which may be modified. 
I'm not aware of a better way to do - // this scan. - const APInt &RHSVal = RHS->getValue(); - APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal)); - - // Now that we know which bits have carries, compute the known-1/0 sets. - - // Bits are known one if they are known zero in one operand and one in the - // other, and there is no input carry. - KnownOne = ((LHSKnownZero & RHSVal) | - (LHSKnownOne & ~RHSVal)) & ~CarryBits; - - // Bits are known zero if they are known zero in both operands and there - // is no input carry. - KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits; - } else { - // If the high-bits of this ADD are not demanded, then it does not demand - // the high bits of its LHS or RHS. - if (DemandedMask[BitWidth-1] == 0) { - // Right fill the mask of bits for this ADD to demand the most - // significant bit and all those below it. - APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps, - LHSKnownZero, LHSKnownOne, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps, - LHSKnownZero, LHSKnownOne, Depth+1)) - return I; - } - } - break; - } - case Instruction::Sub: - // If the high-bits of this SUB are not demanded, then it does not demand - // the high bits of its LHS or RHS. - if (DemandedMask[BitWidth-1] == 0) { - // Right fill the mask of bits for this SUB to demand the most + if (NLZ > 0) { + // Right fill the mask of bits for this ADD/SUB to demand the most // significant bit and all those below it. - uint32_t NLZ = DemandedMask.countLeadingZeros(); APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps, - LHSKnownZero, LHSKnownOne, Depth+1) || + LHSKnownZero, LHSKnownOne, Depth + 1) || + ShrinkDemandedConstant(I, 1, DemandedFromOps) || SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps, - LHSKnownZero, LHSKnownOne, Depth+1)) + LHSKnownZero, LHSKnownOne, Depth + 1)) { + // Disable the nsw and nuw flags here: We can no longer guarantee that + // we won't wrap after simplification. Removing the nsw/nuw flags is + // legal here because the top bit is not demanded. + BinaryOperator &BinOP = *cast<BinaryOperator>(I); + BinOP.setHasNoSignedWrap(false); + BinOP.setHasNoUnsignedWrap(false); return I; + } } - // Otherwise just hand the sub off to computeKnownBits to fill in + // Otherwise just hand the add/sub off to computeKnownBits to fill in // the known zeros and ones. computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI); - - // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known - // zero. 
- if (ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(0))) { - APInt I0 = C0->getValue(); - if ((I0 + 1).isPowerOf2() && (I0 | KnownZero).isAllOnesValue()) { - Instruction *Xor = BinaryOperator::CreateXor(I->getOperand(1), C0); - return InsertNewInstWith(Xor, *I); - } - } break; + } case Instruction::Shl: if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { { @@ -662,8 +574,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, else if (IOp->hasNoUnsignedWrap()) DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero, + KnownOne, Depth + 1)) return I; assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); KnownZero <<= ShiftAmt; @@ -686,8 +598,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (cast<LShrOperator>(I)->isExact()) DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero, + KnownOne, Depth + 1)) return I; assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); @@ -731,8 +643,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (cast<AShrOperator>(I)->isExact()) DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, - KnownZero, KnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero, + KnownOne, Depth + 1)) return I; assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); // Compute the new bits that are at the top now. @@ -772,8 +684,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt LowBits = RA - 1; APInt Mask2 = LowBits | APInt::getSignBit(BitWidth); - if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, - LHSKnownZero, LHSKnownOne, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, LHSKnownZero, + LHSKnownOne, Depth + 1)) return I; // The low bits of LHS are unchanged by the srem. @@ -798,7 +710,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // remainder is zero. if (DemandedMask.isNegative() && KnownZero.isNonNegative()) { APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1, + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, CxtI); // If it's known zero, our sign bit is also zero. 
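A small sketch of the fact used here (hypothetical IR):

  %x = and i32 %y, 127
  %r = srem i32 %x, %m

srem takes the sign of its dividend, so with the sign bit of %x known zero, the sign bit of %r is known zero as well.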
if (LHSKnownZero.isNegative()) @@ -808,10 +720,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, case Instruction::URem: { APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0); APInt AllOnes = APInt::getAllOnesValue(BitWidth); - if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, - KnownZero2, KnownOne2, Depth+1) || - SimplifyDemandedBits(I->getOperandUse(1), AllOnes, - KnownZero2, KnownOne2, Depth+1)) + if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, KnownZero2, + KnownOne2, Depth + 1) || + SimplifyDemandedBits(I->getOperandUse(1), AllOnes, KnownZero2, + KnownOne2, Depth + 1)) return I; unsigned Leaders = KnownZero2.countLeadingOnes(); @@ -1051,7 +963,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // Note that we can't propagate undef elt info, because we don't know // which elt is getting updated. TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, - UndefElts2, Depth+1); + UndefElts2, Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } break; } @@ -1069,7 +981,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt DemandedElts2 = DemandedElts; DemandedElts2.clearBit(IdxNo); TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2, - UndefElts, Depth+1); + UndefElts, Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } // The inserted element is defined. @@ -1097,12 +1009,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt UndefElts4(LHSVWidth, 0); TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded, - UndefElts4, Depth+1); + UndefElts4, Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } APInt UndefElts3(LHSVWidth, 0); TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded, - UndefElts3, Depth+1); + UndefElts3, Depth + 1); if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } bool NewUndefElts = false; @@ -1152,12 +1064,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } } - TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded, - UndefElts, Depth+1); + TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded, UndefElts, + Depth + 1); if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded, - UndefElts2, Depth+1); + UndefElts2, Depth + 1); if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; } // Output elements are undefined if both are undefined. @@ -1204,7 +1116,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // div/rem demand all inputs, because they don't want divide by zero. TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts, - UndefElts2, Depth+1); + UndefElts2, Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; @@ -1238,11 +1150,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Instruction::Sub: case Instruction::Mul: // div/rem demand all inputs, because they don't want divide by zero. 
- TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, - UndefElts, Depth+1); + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts, + Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts, - UndefElts2, Depth+1); + UndefElts2, Depth + 1); if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } // Output elements are undefined if both are undefined. Consider things @@ -1251,8 +1163,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; case Instruction::FPTrunc: case Instruction::FPExt: - TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, - UndefElts, Depth+1); + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts, + Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } break; @@ -1273,10 +1185,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::x86_sse2_min_sd: case Intrinsic::x86_sse2_max_sd: TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, - UndefElts, Depth+1); + UndefElts, Depth + 1); if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, - UndefElts2, Depth+1); + UndefElts2, Depth + 1); if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } // If only the low elt is demanded and this is a scalarizable intrinsic, diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index cb16584..24446c8 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -12,7 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "InstCombine.h" +#include "InstCombineInternal.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -201,8 +202,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { APInt UndefElts(VectorWidth, 0); APInt DemandedMask(VectorWidth, 0); DemandedMask.setBit(IndexVal); - if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), - DemandedMask, UndefElts)) { + if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask, + UndefElts)) { EI.setOperand(0, V); return &EI; } @@ -732,7 +733,8 @@ static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { case Instruction::GetElementPtr: { Value *Ptr = NewOps[0]; ArrayRef<Value*> Idx = NewOps.slice(1); - GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I); GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds()); return GEP; } @@ -853,10 +855,32 @@ static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, } } +// Returns true if the shuffle is extracting a contiguous range of values from +// LHS, for example: +// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP| +// Shuffles to: |EE|FF|GG|HH| +// +--+--+--+--+ +static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI, + SmallVector<int, 16> &Mask) { + unsigned LHSElems = + cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements(); + unsigned MaskElems = Mask.size(); + 
unsigned BegIdx = Mask.front(); + unsigned EndIdx = Mask.back(); + if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1) + return false; + for (unsigned I = 0; I != MaskElems; ++I) + if (static_cast<unsigned>(Mask[I]) != BegIdx + I) + return false; + return true; +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); SmallVector<int, 16> Mask = SVI.getShuffleMask(); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); bool MadeChange = false; @@ -892,18 +916,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { SmallVector<Constant*, 16> Elts; for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); continue; } if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || (Mask[i] < (int)e && isa<UndefValue>(LHS))) { Mask[i] = -1; // Turn into undef. - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + Elts.push_back(UndefValue::get(Int32Ty)); } else { Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), - Mask[i])); + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); } } SVI.setOperand(0, SVI.getOperand(1)); @@ -929,6 +952,95 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return ReplaceInstUsesWith(SVI, V); } + // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to + // a non-vector type. We can instead bitcast the original vector followed by + // an extract of the desired element: + // + // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, + // <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // %1 = bitcast <4 x i8> %sroa to i32 + // Becomes: + // %bc = bitcast <16 x i8> %in to <4 x i32> + // %ext = extractelement <4 x i32> %bc, i32 0 + // + // If the shuffle is extracting a contiguous range of values from the input + // vector then each use which is a bitcast of the extracted size can be + // replaced. This will work if the vector types are compatible, and the begin + // index is aligned to a value in the casted vector type. If the begin index + // isn't aligned then we can shuffle the original vector (keeping the same + // vector type) before extracting. + // + // This code will bail out if the target type is fundamentally incompatible + // with vectors of the source type. + // + // Example of <16 x i8>, target type i32: + // Index range [4,8): v-----------v Will work. + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // <16 x i8>: | | | | | | | | | | | | | | | | | + // <4 x i32>: | | | | | + // +-----------+-----------+-----------+-----------+ + // Index range [6,10): ^-----------^ Needs an extra shuffle. + // Target type i40: ^--------------^ Won't work, bail. 
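The begin-index arithmetic described in this comment reduces to a couple of divisions. A minimal LLVM-free sketch (the function name and the use of std::optional are illustrative; it assumes the target element is at least as wide as the source element and that both widths divide the vector width, mirroring the VecBitWidthsEqual guard and the SrcElemsPerTgtElem assert in the code that follows):

    #include <optional>

    // Does the extracted range starting at BegIdx (counted in
    // SrcBits-wide elements) begin on a TgtBits-wide element boundary?
    // If so, return the element index to extract after the bitcast.
    std::optional<unsigned> extractIdxAfterBitcast(unsigned BegIdx,
                                                   unsigned SrcBits,
                                                   unsigned TgtBits) {
      if ((BegIdx * SrcBits) % TgtBits != 0)
        return std::nullopt; // misaligned: shuffle down to index 0 first
      return BegIdx / (TgtBits / SrcBits);
    }

From the <16 x i8> example: extractIdxAfterBitcast(4, 8, 32) == 1 (range [4,8) is element 1 of the <4 x i32> bitcast), while extractIdxAfterBitcast(6, 8, 32) is nullopt, matching the "needs an extra shuffle" case.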
+ if (isShuffleExtractingFromLHS(SVI, Mask)) { + Value *V = LHS; + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + VectorType *SrcTy = cast<VectorType>(V->getType()); + unsigned VecBitWidth = SrcTy->getBitWidth(); + unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); + assert(SrcElemBitWidth && "vector elements must have a bitwidth"); + unsigned SrcNumElems = SrcTy->getNumElements(); + SmallVector<BitCastInst *, 8> BCs; + DenseMap<Type *, Value *> NewBCs; + for (User *U : SVI.users()) + if (BitCastInst *BC = dyn_cast<BitCastInst>(U)) + if (!BC->use_empty()) + // Only visit bitcasts that weren't previously handled. + BCs.push_back(BC); + for (BitCastInst *BC : BCs) { + Type *TgtTy = BC->getDestTy(); + unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy); + if (!TgtElemBitWidth) + continue; + unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth; + bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth; + bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth); + if (!VecBitWidthsEqual) + continue; + if (!VectorType::isValidElementType(TgtTy)) + continue; + VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems); + if (!BegIsAligned) { + // Shuffle the input so [0,NumElements) contains the output, and + // [NumElems,SrcNumElems) is undef. + SmallVector<Constant *, 16> ShuffleMask(SrcNumElems, + UndefValue::get(Int32Ty)); + for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I) + ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx); + V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(ShuffleMask), + SVI.getName() + ".extract"); + BegIdx = 0; + } + unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; + assert(SrcElemsPerTgtElem); + BegIdx /= SrcElemsPerTgtElem; + bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end(); + auto *NewBC = + BCAlreadyExists + ? NewBCs[CastSrcTy] + : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); + if (!BCAlreadyExists) + NewBCs[CastSrcTy] = NewBC; + auto *Ext = Builder->CreateExtractElement( + NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract"); + // The shufflevector isn't being replaced: the bitcast that used it + // is. InstCombine will visit the newly-created instructions. + ReplaceInstUsesWith(*BC, Ext); + MadeChange = true; + } + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: @@ -1099,7 +1211,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // or is a splat, do the replacement. if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { SmallVector<Constant*, 16> Elts; - Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); for (unsigned i = 0, e = newMask.size(); i != e; ++i) { if (newMask[i] < 0) { Elts.push_back(UndefValue::get(Int32Ty)); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h deleted file mode 100644 index 8d857d0..0000000 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h +++ /dev/null @@ -1,107 +0,0 @@ -//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H -#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Instruction.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "instcombine" - -namespace llvm { - -/// InstCombineWorklist - This is the worklist management logic for -/// InstCombine. -class LLVM_LIBRARY_VISIBILITY InstCombineWorklist { - SmallVector<Instruction*, 256> Worklist; - DenseMap<Instruction*, unsigned> WorklistMap; - - void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION; - InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION; -public: - InstCombineWorklist() {} - - bool isEmpty() const { return Worklist.empty(); } - - /// Add - Add the specified instruction to the worklist if it isn't already - /// in it. - void Add(Instruction *I) { - if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) { - DEBUG(dbgs() << "IC: ADD: " << *I << '\n'); - Worklist.push_back(I); - } - } - - void AddValue(Value *V) { - if (Instruction *I = dyn_cast<Instruction>(V)) - Add(I); - } - - /// AddInitialGroup - Add the specified batch of stuff in reverse order. - /// which should only be done when the worklist is empty and when the group - /// has no duplicates. - void AddInitialGroup(Instruction *const *List, unsigned NumEntries) { - assert(Worklist.empty() && "Worklist must be empty to add initial group"); - Worklist.reserve(NumEntries+16); - WorklistMap.resize(NumEntries); - DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); - for (unsigned Idx = 0; NumEntries; --NumEntries) { - Instruction *I = List[NumEntries-1]; - WorklistMap.insert(std::make_pair(I, Idx++)); - Worklist.push_back(I); - } - } - - // Remove - remove I from the worklist if it exists. - void Remove(Instruction *I) { - DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I); - if (It == WorklistMap.end()) return; // Not in worklist. - - // Don't bother moving everything down, just null out the slot. - Worklist[It->second] = nullptr; - - WorklistMap.erase(It); - } - - Instruction *RemoveOne() { - Instruction *I = Worklist.pop_back_val(); - WorklistMap.erase(I); - return I; - } - - /// AddUsersToWorkList - When an instruction is simplified, add all users of - /// the instruction to the work lists because they might get more simplified - /// now. - /// - void AddUsersToWorkList(Instruction &I) { - for (User *U : I.users()) - Add(cast<Instruction>(U)); - } - - - /// Zap - check that the worklist is empty and nuke the backing store for - /// the map if it is large. - void Zap() { - assert(WorklistMap.empty() && "Worklist empty, but map not?"); - - // Do an explicit clear, this shrinks the map if needed. - WorklistMap.clear(); - } -}; - -} // end namespace llvm. 
- -#undef DEBUG_TYPE - -#endif diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index a0c239a..be49cd1 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -33,8 +33,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" -#include "InstCombine.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "InstCombineInternal.h" #include "llvm-c/Initialization.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -43,8 +43,10 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" @@ -55,7 +57,8 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <climits> @@ -72,35 +75,8 @@ STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumFactor , "Number of factorizations"); STATISTIC(NumReassoc , "Number of reassociations"); -// Initialization Routines -void llvm::initializeInstCombine(PassRegistry &Registry) { - initializeInstCombinerPass(Registry); -} - -void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { - initializeInstCombine(*unwrap(R)); -} - -char InstCombiner::ID = 0; -INITIALIZE_PASS_BEGIN(InstCombiner, "instcombine", - "Combine redundant instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(InstCombiner, "instcombine", - "Combine redundant instructions", false, false) - -void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetLibraryInfo>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); -} - - Value *InstCombiner::EmitGEPOffset(User *GEP) { - return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP); + return llvm::EmitGEPOffset(Builder, DL, GEP); } /// ShouldChangeType - Return true if it is desirable to convert a computation @@ -109,13 +85,10 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { assert(From->isIntegerTy() && To->isIntegerTy()); - // If we don't have DL, we don't know if the source/dest are legal. - if (!DL) return false; - unsigned FromWidth = From->getPrimitiveSizeInBits(); unsigned ToWidth = To->getPrimitiveSizeInBits(); - bool FromLegal = DL->isLegalInteger(FromWidth); - bool ToLegal = DL->isLegalInteger(ToWidth); + bool FromLegal = DL.isLegalInteger(FromWidth); + bool ToLegal = DL.isLegalInteger(ToWidth); // If this is a legal integer from type, and the result would be an illegal // type, don't do the transformation. 
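The hunk above cuts ShouldChangeType off after its first rule; the rest of the upstream function (stated here as an assumption, since this diff elides it) also rejects growing one illegal width into another. A standalone sketch, with the legal widths passed explicitly in place of DL.isLegalInteger():

    #include <algorithm>
    #include <vector>

    // LegalWidths stands in for the DataLayout's native integer widths,
    // e.g. {8, 16, 32, 64} on x86-64.
    bool shouldChangeType(const std::vector<unsigned> &LegalWidths,
                          unsigned FromWidth, unsigned ToWidth) {
      auto IsLegal = [&](unsigned W) {
        return std::find(LegalWidths.begin(), LegalWidths.end(), W) !=
               LegalWidths.end();
      };
      // Never turn a legal integer type into an illegal one.
      if (IsLegal(FromWidth) && !IsLegal(ToWidth))
        return false;
      // Between two illegal widths, shrinking is fine, growing is not.
      if (!IsLegal(FromWidth) && !IsLegal(ToWidth) && ToWidth > FromWidth)
        return false;
      return true;
    }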
@@ -470,7 +443,7 @@ getBinOpsForFactorization, /// This tries to simplify binary operations by factorizing out common terms /// (e.g. "(A*B)+(A*C)" -> "A*(B+C)"). static Value *tryFactorization(InstCombiner::BuilderTy *Builder, - const DataLayout *DL, BinaryOperator &I, + const DataLayout &DL, BinaryOperator &I, Instruction::BinaryOps InnerOpcode, Value *A, Value *B, Value *C, Value *D) { @@ -479,6 +452,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, if (!A || !C || !B || !D) return nullptr; + Value *V = nullptr; Value *SimplifiedInst = nullptr; Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); @@ -495,7 +469,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, std::swap(C, D); // Consider forming "A op' (B op D)". // If "B op D" simplifies then it can be formed with no cost. - Value *V = SimplifyBinOp(TopLevelOpcode, B, D, DL); + V = SimplifyBinOp(TopLevelOpcode, B, D, DL); // If "B op D" doesn't simplify then only go on if both of the existing // operations "A op' B" and "C op' D" will be zapped as no longer used. if (!V && LHS->hasOneUse() && RHS->hasOneUse()) @@ -514,7 +488,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, std::swap(C, D); // Consider forming "(A op C) op' B". // If "A op C" simplifies then it can be formed with no cost. - Value *V = SimplifyBinOp(TopLevelOpcode, A, C, DL); + V = SimplifyBinOp(TopLevelOpcode, A, C, DL); // If "A op C" doesn't simplify then only go on if both of the existing // operations "A op' B" and "C op' D" will be zapped as no longer used. @@ -544,7 +518,19 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS)) if (isa<OverflowingBinaryOperator>(Op1)) HasNSW &= Op1->hasNoSignedWrap(); - BO->setHasNoSignedWrap(HasNSW); + + // We can propagate 'nsw' if we know that + // %Y = mul nsw i16 %X, C + // %Z = add nsw i16 %Y, %X + // => + // %Z = mul nsw i16 %X, C+1 + // + // iff C+1 isn't INT_MIN + const APInt *CInt; + if (TopLevelOpcode == Instruction::Add && + InnerOpcode == Instruction::Mul) + if (match(V, m_APInt(CInt)) && !CInt->isMinSignedValue()) + BO->setHasNoSignedWrap(HasNSW); } } } @@ -741,6 +727,22 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return nullptr; } + // Test if a CmpInst instruction is used exclusively by a select as + // part of a minimum or maximum operation. If so, refrain from doing + // any other folding. This helps out other analyses which understand + // non-obfuscated minimum and maximum idioms, such as ScalarEvolution + // and CodeGen. And in this case, at least one of the comparison + // operands has at least one user besides the compare (the select), + // which would often largely negate the benefit of folding anyway.
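The test that follows is a symmetric operand check: the select's arms must be exactly the compare's operands, in either order (the straight order spells min/max directly, the swapped order is the inverted predicate). A minimal rendering with opaque pointers standing in for the Values:

    struct Val {}; // stand-in; only pointer identity matters here

    // True for both "select (cmp a, b), a, b" and "select (cmp a, b), b, a".
    bool selectUsesCompareOperands(const Val *CmpOp0, const Val *CmpOp1,
                                   const Val *TrueVal, const Val *FalseVal) {
      return (TrueVal == CmpOp0 && FalseVal == CmpOp1) ||
             (TrueVal == CmpOp1 && FalseVal == CmpOp0);
    }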
+ if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) { + if (CI->hasOneUse()) { + Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) || + (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1)) + return nullptr; + } + } + Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this); @@ -750,7 +752,6 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return nullptr; } - /// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which /// has a PHI node as operand #0, see if we can fold the instruction into the /// PHI (which is only possible if all operands to the PHI are constants). @@ -799,8 +800,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { // If the incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. - if (isPotentiallyReachable(I.getParent(), NonConstBB, DT, - getAnalysisIfAvailable<LoopInfo>())) + if (isPotentiallyReachable(I.getParent(), NonConstBB, DT, LI)) return nullptr; } @@ -897,23 +897,18 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { /// whether or not there is a sequence of GEP indices into the pointed type that /// will land us at the specified offset. If so, fill them into NewIndices and /// return the resultant element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, - SmallVectorImpl<Value*> &NewIndices) { - assert(PtrTy->isPtrOrPtrVectorTy()); - - if (!DL) - return nullptr; - - Type *Ty = PtrTy->getPointerElementType(); +Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, + SmallVectorImpl<Value *> &NewIndices) { + Type *Ty = PtrTy->getElementType(); if (!Ty->isSized()) return nullptr; // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] - Type *IntPtrTy = DL->getIntPtrType(PtrTy); + Type *IntPtrTy = DL.getIntPtrType(PtrTy); int64_t FirstIdx = 0; - if (int64_t TySize = DL->getTypeAllocSize(Ty)) { + if (int64_t TySize = DL.getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; Offset -= FirstIdx*TySize; @@ -931,11 +926,11 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, // Index into the types. If we fail, set OrigBase to null. while (Offset) { // Indexing into tail padding between struct/array elements. 
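The offset walk in FindElementAtOffset starts by peeling an index over the outer element size off a possibly negative byte offset; the fix-up that keeps the remainder non-negative sits in the elided part of the hunk (stated here as an assumption about the upstream function). The arithmetic as a standalone sketch:

    #include <cassert>
    #include <cstdint>
    #include <utility>

    // Split Offset into (index over EltSize-byte elements, remainder in
    // [0, EltSize)). C++ division truncates toward zero, so negative
    // offsets need one correction step.
    std::pair<int64_t, int64_t> splitOffset(int64_t Offset, int64_t EltSize) {
      assert(EltSize > 0 && "zero-sized element types are handled earlier");
      int64_t Idx = Offset / EltSize;
      int64_t Rem = Offset - Idx * EltSize;
      if (Rem < 0) {
        --Idx;
        Rem += EltSize;
      }
      return {Idx, Rem};
    }

For example, splitOffset(-3, 4) == {-1, 1}: byte -3 is byte 1 of element -1.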
- if (uint64_t(Offset*8) >= DL->getTypeSizeInBits(Ty)) + if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty)) return nullptr; if (StructType *STy = dyn_cast<StructType>(Ty)) { - const StructLayout *SL = DL->getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); assert(Offset < (int64_t)SL->getSizeInBytes() && "Offset must stay within the indexed type"); @@ -946,7 +941,7 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, Offset -= SL->getElementOffset(Elt); Ty = STy->getElementType(Elt); } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { - uint64_t EltSize = DL->getTypeAllocSize(AT->getElementType()); + uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType()); assert(EltSize && "Cannot index into a zero-sized array"); NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize)); Offset %= EltSize; @@ -1240,7 +1235,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { // It may not be safe to reorder shuffles and things like div, urem, etc. // because we may trap when executing those ops on unknown vector elements. // See PR20059. - if (!isSafeToSpeculativelyExecute(&Inst, DL)) return nullptr; + if (!isSafeToSpeculativelyExecute(&Inst)) + return nullptr; unsigned VWidth = cast<VectorType>(Inst.getType())->getNumElements(); Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1); @@ -1326,37 +1322,37 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. - if (DL) { - bool MadeChange = false; - Type *IntPtrTy = DL->getIntPtrType(GEP.getPointerOperandType()); - - gep_type_iterator GTI = gep_type_begin(GEP); - for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); - I != E; ++I, ++GTI) { - // Skip indices into struct types. - SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI); - if (!SeqTy) continue; - - // If the element type has zero size then any index over it is equivalent - // to an index of zero, so replace it with zero if it is not zero already. - if (SeqTy->getElementType()->isSized() && - DL->getTypeAllocSize(SeqTy->getElementType()) == 0) - if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { - *I = Constant::getNullValue(IntPtrTy); - MadeChange = true; - } + bool MadeChange = false; + Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType()); + + gep_type_iterator GTI = gep_type_begin(GEP); + for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; + ++I, ++GTI) { + // Skip indices into struct types. + SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI); + if (!SeqTy) + continue; - Type *IndexTy = (*I)->getType(); - if (IndexTy != IntPtrTy) { - // If we are using a wider index than needed for this platform, shrink - // it to what we need. If narrower, sign-extend it to what we need. - // This explicit cast can make subsequent optimizations more obvious. - *I = Builder->CreateIntCast(*I, IntPtrTy, true); + // If the element type has zero size then any index over it is equivalent + // to an index of zero, so replace it with zero if it is not zero already. 
+ if (SeqTy->getElementType()->isSized() && + DL.getTypeAllocSize(SeqTy->getElementType()) == 0) + if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { + *I = Constant::getNullValue(IntPtrTy); MadeChange = true; } + + Type *IndexTy = (*I)->getType(); + if (IndexTy != IntPtrTy) { + // If we are using a wider index than needed for this platform, shrink + // it to what we need. If narrower, sign-extend it to what we need. + // This explicit cast can make subsequent optimizations more obvious. + *I = Builder->CreateIntCast(*I, IntPtrTy, true); + MadeChange = true; } - if (MadeChange) return &GEP; } + if (MadeChange) + return &GEP; // Check to see if the inputs to the PHI node are getelementptr instructions. if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) { @@ -1364,6 +1360,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!Op1) return nullptr; + // Don't fold a GEP into itself through a PHI node. This can only happen + // through the back-edge of a loop. Folding a GEP into itself means that + // the value of the previous iteration needs to be stored in the meantime, + // thus requiring an additional register variable to be live, but not + // actually achieving anything (the GEP still needs to be executed once per + // loop iteration). + if (Op1 == &GEP) + return nullptr; + signed DI = -1; for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { @@ -1371,6 +1376,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands()) return nullptr; + // As for Op1 above, don't try to fold a GEP into itself. + if (Op2 == &GEP) + return nullptr; + // Keep track of the type as we walk the GEP. Type *CurTy = Op1->getOperand(0)->getType()->getScalarType(); @@ -1417,8 +1426,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. - GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), - NewGEP); + GEP.getParent()->getInstList().insert( + GEP.getParent()->getFirstInsertionPt(), NewGEP); } else { // All the GEPs feeding the PHI differ at a single offset. Clone a GEP // into the current block so it can be merged, and create a new PHI to @@ -1434,8 +1443,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { PN->getIncomingBlock(I)); NewGEP->setOperand(DI, NewPN); - GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), - NewGEP); + GEP.getParent()->getInstList().insert( + GEP.getParent()->getFirstInsertionPt(), NewGEP); NewGEP->setOperand(DI, NewPN); } @@ -1486,6 +1495,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // normalized. if (SO1->getType() != GO1->getType()) return nullptr; + // Only do the combine when GO1 and SO1 are both constants. Only in + // this case, we are sure the cost after the merge is never more than + // that before the merge. + if (!isa<Constant>(GO1) || !isa<Constant>(SO1)) + return nullptr; Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum"); } @@ -1507,19 +1521,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } if (!Indices.empty()) - return (GEP.isInBounds() && Src->isInBounds()) ? 
- GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices, - GEP.getName()) : - GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName()); + return GEP.isInBounds() && Src->isInBounds() + ? GetElementPtrInst::CreateInBounds( + Src->getSourceElementType(), Src->getOperand(0), Indices, + GEP.getName()) + : GetElementPtrInst::Create(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()); } - if (DL && GEP.getNumIndices() == 1) { + if (GEP.getNumIndices() == 1) { unsigned AS = GEP.getPointerAddressSpace(); if (GEP.getOperand(1)->getType()->getScalarSizeInBits() == - DL->getPointerSizeInBits(AS)) { + DL.getPointerSizeInBits(AS)) { Type *PtrTy = GEP.getPointerOperandType(); Type *Ty = PtrTy->getPointerElementType(); - uint64_t TyAllocSize = DL->getTypeAllocSize(Ty); + uint64_t TyAllocSize = DL.getTypeAllocSize(Ty); bool Matched = false; uint64_t C; @@ -1588,8 +1605,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (CATy->getElementType() == StrippedPtrTy->getElementType()) { // -> GEP i8* X, ... SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end()); - GetElementPtrInst *Res = - GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName()); + GetElementPtrInst *Res = GetElementPtrInst::Create( + StrippedPtrTy->getElementType(), StrippedPtr, Idx, GEP.getName()); Res->setIsInBounds(GEP.isInBounds()); if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) return Res; @@ -1613,6 +1630,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // is a leading zero) we can fold the cast into this GEP. if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) { GEP.setOperand(0, StrippedPtr); + GEP.setSourceElementType(XATy); return &GEP; } // Cannot replace the base pointer directly because StrippedPtr's @@ -1625,9 +1643,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // %0 = GEP [10 x i8] addrspace(1)* X, ... // addrspacecast i8 addrspace(1)* %0 to i8* SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end()); - Value *NewGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : - Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); + Value *NewGEP = GEP.isInBounds() + ? Builder->CreateInBoundsGEP( + nullptr, StrippedPtr, Idx, GEP.getName()) + : Builder->CreateGEP(nullptr, StrippedPtr, Idx, + GEP.getName()); return new AddrSpaceCastInst(NewGEP, GEP.getType()); } } @@ -1638,14 +1658,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast Type *SrcElTy = StrippedPtrTy->getElementType(); Type *ResElTy = PtrOp->getType()->getPointerElementType(); - if (DL && SrcElTy->isArrayTy() && - DL->getTypeAllocSize(SrcElTy->getArrayElementType()) == - DL->getTypeAllocSize(ResElTy)) { - Type *IdxType = DL->getIntPtrType(GEP.getType()); + if (SrcElTy->isArrayTy() && + DL.getTypeAllocSize(SrcElTy->getArrayElementType()) == + DL.getTypeAllocSize(ResElTy)) { + Type *IdxType = DL.getIntPtrType(GEP.getType()); Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) }; - Value *NewGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : - Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); + Value *NewGEP = + GEP.isInBounds() + ? 
Builder->CreateInBoundsGEP(nullptr, StrippedPtr, Idx, + GEP.getName()) + : Builder->CreateGEP(nullptr, StrippedPtr, Idx, GEP.getName()); // V and GEP are both pointer types --> BitCast return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, @@ -1656,11 +1678,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // %V = mul i64 %N, 4 // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast - if (DL && ResElTy->isSized() && SrcElTy->isSized()) { + if (ResElTy->isSized() && SrcElTy->isSized()) { // Check that changing the type amounts to dividing the index by a scale // factor. - uint64_t ResSize = DL->getTypeAllocSize(ResElTy); - uint64_t SrcSize = DL->getTypeAllocSize(SrcElTy); + uint64_t ResSize = DL.getTypeAllocSize(ResElTy); + uint64_t SrcSize = DL.getTypeAllocSize(SrcElTy); if (ResSize && SrcSize % ResSize == 0) { Value *Idx = GEP.getOperand(1); unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); @@ -1668,7 +1690,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == DL->getIntPtrType(GEP.getType()) && + assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1676,9 +1698,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. // If the multiplication NewIdx * Scale may overflow then the new // GEP may not be "inbounds". - Value *NewGEP = GEP.isInBounds() && NSW ? - Builder->CreateInBoundsGEP(StrippedPtr, NewIdx, GEP.getName()) : - Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName()); + Value *NewGEP = + GEP.isInBounds() && NSW + ? Builder->CreateInBoundsGEP(nullptr, StrippedPtr, NewIdx, + GEP.getName()) + : Builder->CreateGEP(nullptr, StrippedPtr, NewIdx, + GEP.getName()); // The NewGEP must be pointer typed, so must the old one -> BitCast return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, @@ -1691,13 +1716,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp // (where tmp = 8*tmp2) into: // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast - if (DL && ResElTy->isSized() && SrcElTy->isSized() && - SrcElTy->isArrayTy()) { + if (ResElTy->isSized() && SrcElTy->isSized() && SrcElTy->isArrayTy()) { // Check that changing to the array element type amounts to dividing the // index by a scale factor. - uint64_t ResSize = DL->getTypeAllocSize(ResElTy); - uint64_t ArrayEltSize - = DL->getTypeAllocSize(SrcElTy->getArrayElementType()); + uint64_t ResSize = DL.getTypeAllocSize(ResElTy); + uint64_t ArrayEltSize = + DL.getTypeAllocSize(SrcElTy->getArrayElementType()); if (ResSize && ArrayEltSize % ResSize == 0) { Value *Idx = GEP.getOperand(1); unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); @@ -1705,7 +1729,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. 
- assert(Idx->getType() == DL->getIntPtrType(GEP.getType()) && + assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1714,13 +1738,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the multiplication NewIdx * Scale may overflow then the new // GEP may not be "inbounds". Value *Off[2] = { - Constant::getNullValue(DL->getIntPtrType(GEP.getType())), - NewIdx - }; - - Value *NewGEP = GEP.isInBounds() && NSW ? - Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) : - Builder->CreateGEP(StrippedPtr, Off, GEP.getName()); + Constant::getNullValue(DL.getIntPtrType(GEP.getType())), + NewIdx}; + + Value *NewGEP = GEP.isInBounds() && NSW + ? Builder->CreateInBoundsGEP( + SrcElTy, StrippedPtr, Off, GEP.getName()) + : Builder->CreateGEP(SrcElTy, StrippedPtr, Off, + GEP.getName()); // The NewGEP must be pointer typed, so must the old one -> BitCast return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEP.getType()); @@ -1730,9 +1755,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - if (!DL) - return nullptr; - // addrspacecast between types is canonicalized as a bitcast, then an // addrspacecast. To take advantage of the below bitcast + struct GEP, look // through the addrspacecast. @@ -1753,10 +1775,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { Value *Operand = BCI->getOperand(0); PointerType *OpType = cast<PointerType>(Operand->getType()); - unsigned OffsetBits = DL->getPointerTypeSizeInBits(GEP.getType()); + unsigned OffsetBits = DL.getPointerTypeSizeInBits(GEP.getType()); APInt Offset(OffsetBits, 0); if (!isa<BitCastInst>(Operand) && - GEP.accumulateConstantOffset(*DL, Offset)) { + GEP.accumulateConstantOffset(DL, Offset)) { // If this GEP instruction doesn't move the pointer, just replace the GEP // with a bitcast of the real input to the dest type. @@ -1785,9 +1807,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // GEP. SmallVector<Value*, 8> NewIndices; if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) { - Value *NGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(Operand, NewIndices) : - Builder->CreateGEP(Operand, NewIndices); + Value *NGEP = + GEP.isInBounds() + ? Builder->CreateInBoundsGEP(nullptr, Operand, NewIndices) + : Builder->CreateGEP(nullptr, Operand, NewIndices); if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); @@ -2038,6 +2061,15 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { return &BI; } + // If the condition is irrelevant, remove the use so that other + // transforms on the condition become more effective. 
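The clause added just below rewrites "br i1 %cond, label %BB, label %BB" so that the condition becomes undef, freeing up a use of %cond for other folds. The decision itself reduces to a two-part test; a sketch with stand-ins for the blocks and the undef check:

    struct Block {}; // stand-in; only identity matters

    struct CondBranch {
      const Block *Succ0, *Succ1;
      bool CondIsUndef; // models "condition is already undef"
    };

    // A conditional branch with identical successors cannot depend on its
    // condition, so the condition operand can be dropped.
    bool dropIrrelevantCondition(CondBranch &BI) {
      if (BI.Succ0 != BI.Succ1 || BI.CondIsUndef)
        return false;
      BI.CondIsUndef = true; // i.e. setCondition(UndefValue::get(...))
      return true;
    }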
+ if (BI.isConditional() && + BI.getSuccessor(0) == BI.getSuccessor(1) && + !isa<UndefValue>(BI.getCondition())) { + BI.setCondition(UndefValue::get(BI.getCondition()->getType())); + return &BI; + } + // Canonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), @@ -2077,7 +2109,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { Value *Cond = SI.getCondition(); unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(Cond, KnownZero, KnownOne); + computeKnownBits(Cond, KnownZero, KnownOne, 0, &SI); unsigned LeadingKnownZeros = KnownZero.countLeadingOnes(); unsigned LeadingKnownOnes = KnownOne.countLeadingOnes(); @@ -2096,8 +2128,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { // x86 generates redundant zero-extension instructions if the operand is // truncated to i8 or i16. bool TruncCond = false; - if (DL && BitWidth > NewWidth && - NewWidth >= DL->getLargestLegalIntTypeSize()) { + if (NewWidth > 0 && BitWidth > NewWidth && + NewWidth >= DL.getLargestLegalIntTypeSize()) { TruncCond = true; IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth); Builder->SetInsertPoint(&SI); @@ -2270,7 +2302,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // We need to insert these at the location of the old load, not at that of // the extractvalue. Builder->SetInsertPoint(L->getParent(), L); - Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(), Indices); + Value *GEP = Builder->CreateInBoundsGEP(L->getType(), + L->getPointerOperand(), Indices); // Returning the load directly will cause the main loop to insert it in // the wrong spot, so use ReplaceInstUsesWith(). return ReplaceInstUsesWith(EV, Builder->CreateLoad(GEP)); @@ -2286,41 +2319,27 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return nullptr; } -enum Personality_Type { - Unknown_Personality, - GNU_Ada_Personality, - GNU_CXX_Personality, - GNU_ObjC_Personality -}; - -/// RecognizePersonality - See if the given exception handling personality -/// function is one that we understand. If so, return a description of it; -/// otherwise return Unknown_Personality. -static Personality_Type RecognizePersonality(Value *Pers) { - Function *F = dyn_cast<Function>(Pers->stripPointerCasts()); - if (!F) - return Unknown_Personality; - return StringSwitch<Personality_Type>(F->getName()) - .Case("__gnat_eh_personality", GNU_Ada_Personality) - .Case("__gxx_personality_v0", GNU_CXX_Personality) - .Case("__objc_personality_v0", GNU_ObjC_Personality) - .Default(Unknown_Personality); -} - /// isCatchAll - Return 'true' if the given typeinfo will match anything. -static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) { +static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { switch (Personality) { - case Unknown_Personality: + case EHPersonality::GNU_C: + // The GCC C EH personality only exists to support cleanups, so it's not + // clear what the semantics of catch clauses are. return false; - case GNU_Ada_Personality: + case EHPersonality::Unknown: + return false; + case EHPersonality::GNU_Ada: // While __gnat_all_others_value will match any Ada exception, it doesn't // match foreign exceptions (or didn't, before gcc-4.7).
return false; - case GNU_CXX_Personality: - case GNU_ObjC_Personality: + case EHPersonality::GNU_CXX: + case EHPersonality::GNU_ObjC: + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + case EHPersonality::MSVC_CXX: return TypeInfo->isNullValue(); } - llvm_unreachable("Unknown personality!"); + llvm_unreachable("invalid enum"); } static bool shorter_filter(const Value *LHS, const Value *RHS) { @@ -2334,7 +2353,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // The logic here should be correct for any real-world personality function. // However if that turns out not to be true, the offending logic can always // be conditioned on the personality function, like the catch-all logic is. - Personality_Type Personality = RecognizePersonality(LI.getPersonalityFn()); + EHPersonality Personality = classifyEHPersonality(LI.getPersonalityFn()); // Simplify the list of clauses, eg by removing repeated catch clauses // (these are often created by inlining). @@ -2625,9 +2644,6 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } - - - /// TryToSinkInstruction - Try to move the specified instruction from its /// current block into the beginning of DestBlock, which can only happen if it's /// safe to move the instruction past all of the instructions between it and the @@ -2660,164 +2676,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { return true; } - -/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding -/// all reachable code to the worklist. -/// -/// This has a couple of tricks to make the code faster and more powerful. In -/// particular, we constant fold and DCE instructions as we go, to avoid adding -/// them to the worklist (this significantly speeds up instcombine on code where -/// many instructions are dead or constant). Additionally, if we find a branch -/// whose condition is a known constant, we only visit the reachable successors. -/// -static bool AddReachableCodeToWorklist(BasicBlock *BB, - SmallPtrSetImpl<BasicBlock*> &Visited, - InstCombiner &IC, - const DataLayout *DL, - const TargetLibraryInfo *TLI) { - bool MadeIRChange = false; - SmallVector<BasicBlock*, 256> Worklist; - Worklist.push_back(BB); - - SmallVector<Instruction*, 128> InstrsForInstCombineWorklist; - DenseMap<ConstantExpr*, Constant*> FoldedConstants; - - do { - BB = Worklist.pop_back_val(); - - // We have now visited this block! If we've already been here, ignore it. - if (!Visited.insert(BB).second) - continue; - - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *Inst = BBI++; - - // DCE instruction if trivially dead. - if (isInstructionTriviallyDead(Inst, TLI)) { - ++NumDeadInst; - DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); - Inst->eraseFromParent(); - continue; - } - - // ConstantProp instruction if trivially constant. - if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) - if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) { - DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " - << *Inst << '\n'); - Inst->replaceAllUsesWith(C); - ++NumConstProp; - Inst->eraseFromParent(); - continue; - } - - if (DL) { - // See if we can constant fold its operands. 
- for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); - i != e; ++i) { - ConstantExpr *CE = dyn_cast<ConstantExpr>(i); - if (CE == nullptr) continue; - - Constant*& FoldRes = FoldedConstants[CE]; - if (!FoldRes) - FoldRes = ConstantFoldConstantExpression(CE, DL, TLI); - if (!FoldRes) - FoldRes = CE; - - if (FoldRes != CE) { - *i = FoldRes; - MadeIRChange = true; - } - } - } - - InstrsForInstCombineWorklist.push_back(Inst); - } - - // Recursively visit successors. If this is a branch or switch on a - // constant, only visit the reachable successor. - TerminatorInst *TI = BB->getTerminator(); - if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { - if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) { - bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue(); - BasicBlock *ReachableBB = BI->getSuccessor(!CondVal); - Worklist.push_back(ReachableBB); - continue; - } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) { - // See if this is an explicit destination. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) - if (i.getCaseValue() == Cond) { - BasicBlock *ReachableBB = i.getCaseSuccessor(); - Worklist.push_back(ReachableBB); - continue; - } - - // Otherwise it is the default destination. - Worklist.push_back(SI->getDefaultDest()); - continue; - } - } - - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - Worklist.push_back(TI->getSuccessor(i)); - } while (!Worklist.empty()); - - // Once we've found all of the instructions to add to instcombine's worklist, - // add them in reverse order. This way instcombine will visit from the top - // of the function down. This jives well with the way that it adds all uses - // of instructions to the worklist after doing a transformation, thus avoiding - // some N^2 behavior in pathological cases. - IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], - InstrsForInstCombineWorklist.size()); - - return MadeIRChange; -} - -bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { - MadeIRChange = false; - - DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " - << F.getName() << "\n"); - - { - // Do a depth-first traversal of the function, populate the worklist with - // the reachable instructions. Ignore blocks that are not reachable. Keep - // track of which blocks we visit. - SmallPtrSet<BasicBlock*, 64> Visited; - MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, DL, - TLI); - - // Do a quick scan over the function. If we find any blocks that are - // unreachable, remove any instructions inside of them. This prevents - // the instcombine code from having to deal with some bad special cases. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (Visited.count(BB)) continue; - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. - Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. - while (EndInst != BB->begin()) { - // Delete the next to last instruction. 
- BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; - if (!Inst->use_empty()) - Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { - EndInst = Inst; - continue; - } - if (!isa<DbgInfoIntrinsic>(Inst)) { - ++NumDeadInst; - MadeIRChange = true; - } - Inst->eraseFromParent(); - } - } - } - +bool InstCombiner::run() { while (!Worklist.isEmpty()) { Instruction *I = Worklist.RemoveOne(); if (I == nullptr) continue; // skip null values. @@ -2832,7 +2691,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { } // Instruction isn't dead, see if we can constant propagate it. - if (!I->use_empty() && isa<Constant>(I->getOperand(0))) + if (!I->use_empty() && isa<Constant>(I->getOperand(0))) { if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) { DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); @@ -2843,6 +2702,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = true; continue; } + } // See if we can trivially sink this instruction to a successor basic block. if (I->hasOneUse()) { @@ -2900,7 +2760,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { DEBUG(dbgs() << "IC: Old = " << *I << '\n' << " New = " << *Result << '\n'); - if (!I->getDebugLoc().isUnknown()) + if (I->getDebugLoc()) Result->setDebugLoc(I->getDebugLoc()); // Everything uses the new instruction now. I->replaceAllUsesWith(Result); @@ -2947,63 +2807,287 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { return MadeIRChange; } -namespace { -class InstCombinerLibCallSimplifier final : public LibCallSimplifier { - InstCombiner *IC; -public: - InstCombinerLibCallSimplifier(const DataLayout *DL, - const TargetLibraryInfo *TLI, - InstCombiner *IC) - : LibCallSimplifier(DL, TLI) { - this->IC = IC; - } +/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding +/// all reachable code to the worklist. +/// +/// This has a couple of tricks to make the code faster and more powerful. In +/// particular, we constant fold and DCE instructions as we go, to avoid adding +/// them to the worklist (this significantly speeds up instcombine on code where +/// many instructions are dead or constant). Additionally, if we find a branch +/// whose condition is a known constant, we only visit the reachable successors. +/// +static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, + SmallPtrSetImpl<BasicBlock *> &Visited, + InstCombineWorklist &ICWorklist, + const TargetLibraryInfo *TLI) { + bool MadeIRChange = false; + SmallVector<BasicBlock*, 256> Worklist; + Worklist.push_back(BB); - /// replaceAllUsesWith - override so that instruction replacement - /// can be defined in terms of the instruction combiner framework. - void replaceAllUsesWith(Instruction *I, Value *With) const override { - IC->ReplaceInstUsesWith(*I, With); - } -}; + SmallVector<Instruction*, 128> InstrsForInstCombineWorklist; + DenseMap<ConstantExpr*, Constant*> FoldedConstants; + + do { + BB = Worklist.pop_back_val(); + + // We have now visited this block! If we've already been here, ignore it. + if (!Visited.insert(BB).second) + continue; + + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { + Instruction *Inst = BBI++; + + // DCE instruction if trivially dead. 
+ if (isInstructionTriviallyDead(Inst, TLI)) { + ++NumDeadInst; + DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); + Inst->eraseFromParent(); + continue; + } + + // ConstantProp instruction if trivially constant. + if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) + if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) { + DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " + << *Inst << '\n'); + Inst->replaceAllUsesWith(C); + ++NumConstProp; + Inst->eraseFromParent(); + continue; + } + + // See if we can constant fold its operands. + for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e; + ++i) { + ConstantExpr *CE = dyn_cast<ConstantExpr>(i); + if (CE == nullptr) + continue; + + Constant *&FoldRes = FoldedConstants[CE]; + if (!FoldRes) + FoldRes = ConstantFoldConstantExpression(CE, DL, TLI); + if (!FoldRes) + FoldRes = CE; + + if (FoldRes != CE) { + *i = FoldRes; + MadeIRChange = true; + } + } + + InstrsForInstCombineWorklist.push_back(Inst); + } + + // Recursively visit successors. If this is a branch or switch on a + // constant, only visit the reachable successor. + TerminatorInst *TI = BB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) { + bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue(); + BasicBlock *ReachableBB = BI->getSuccessor(!CondVal); + Worklist.push_back(ReachableBB); + continue; + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) { + // See if this is an explicit destination. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) + if (i.getCaseValue() == Cond) { + BasicBlock *ReachableBB = i.getCaseSuccessor(); + Worklist.push_back(ReachableBB); + continue; + } + + // Otherwise it is the default destination. + Worklist.push_back(SI->getDefaultDest()); + continue; + } + } + + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + Worklist.push_back(TI->getSuccessor(i)); + } while (!Worklist.empty()); + + // Once we've found all of the instructions to add to instcombine's worklist, + // add them in reverse order. This way instcombine will visit from the top + // of the function down. This jives well with the way that it adds all uses + // of instructions to the worklist after doing a transformation, thus avoiding + // some N^2 behavior in pathological cases. + ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], + InstrsForInstCombineWorklist.size()); + + return MadeIRChange; } -bool InstCombiner::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; +/// \brief Populate the IC worklist from a function, and prune any dead basic +/// blocks discovered in the process. +/// +/// This also does basic constant propagation and other forward fixing to make +/// the combiner itself run much faster. +static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, + TargetLibraryInfo *TLI, + InstCombineWorklist &ICWorklist) { + bool MadeIRChange = false; + + // Do a depth-first traversal of the function, populate the worklist with + // the reachable instructions. Ignore blocks that are not reachable. Keep + // track of which blocks we visit. + SmallPtrSet<BasicBlock *, 64> Visited; + MadeIRChange |= + AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI); + + // Do a quick scan over the function. 
If we find any blocks that are + // unreachable, remove any instructions inside of them. This prevents + // the instcombine code from having to deal with some bad special cases. + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (Visited.count(BB)) + continue; + + // Delete the instructions backwards, as it has a reduced likelihood of + // having to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. + BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; + } + if (!isa<DbgInfoIntrinsic>(Inst)) { + ++NumDeadInst; + MadeIRChange = true; + } + Inst->eraseFromParent(); + } + } - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = &getAnalysis<TargetLibraryInfo>(); + return MadeIRChange; +} +static bool +combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, + AssumptionCache &AC, TargetLibraryInfo &TLI, + DominatorTree &DT, LoopInfo *LI = nullptr) { // Minimizing size? - MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize); + bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); + auto &DL = F.getParent()->getDataLayout(); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. - IRBuilder<true, TargetFolder, InstCombineIRInserter> TheBuilder( - F.getContext(), TargetFolder(DL), InstCombineIRInserter(Worklist, AC)); - Builder = &TheBuilder; - - InstCombinerLibCallSimplifier TheSimplifier(DL, TLI, this); - Simplifier = &TheSimplifier; - - bool EverMadeChange = false; + IRBuilder<true, TargetFolder, InstCombineIRInserter> Builder( + F.getContext(), TargetFolder(DL), InstCombineIRInserter(Worklist, &AC)); // Lower dbg.declare intrinsics otherwise their value may be clobbered // by instcombiner. - EverMadeChange = LowerDbgDeclare(F); + bool DbgDeclaresChanged = LowerDbgDeclare(F); // Iterate while there is work to do. - unsigned Iteration = 0; - while (DoOneIteration(F, Iteration++)) - EverMadeChange = true; + int Iteration = 0; + for (;;) { + ++Iteration; + DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " + << F.getName() << "\n"); + + bool Changed = false; + if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) + Changed = true; + + InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI); + if (IC.run()) + Changed = true; + + if (!Changed) + break; + } + + return DbgDeclaresChanged || Iteration > 1; +} + +PreservedAnalyses InstCombinePass::run(Function &F, + AnalysisManager<Function> *AM) { + auto &AC = AM->getResult<AssumptionAnalysis>(F); + auto &DT = AM->getResult<DominatorTreeAnalysis>(F); + auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); + + auto *LI = AM->getCachedResult<LoopAnalysis>(F); + + if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI)) + // No changes, all analyses are preserved. + return PreservedAnalyses::all(); + + // Mark all the analyses that instcombine updates as preserved. + // FIXME: Need a way to preserve CFG analyses here! 
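combineInstructionsOverFunction above is a plain fixed-point driver: re-seed the worklist, run the combiner, and stop once an iteration changes nothing. Its shape, with std::function stand-ins for the two steps (illustrative; the real return value also folds in the LowerDbgDeclare result):

    #include <functional>

    bool runToFixedPoint(const std::function<bool()> &PrepareWorklist,
                         const std::function<bool()> &Combine) {
      int Iteration = 0;
      for (;;) {
        ++Iteration;
        bool Changed = PrepareWorklist(); // constant-fold/DCE + populate
        Changed |= Combine();             // one InstCombiner::run() pass
        if (!Changed)
          break;
      }
      return Iteration > 1; // anything beyond the first pass changed IR
    }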
+ PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +namespace { +/// \brief The legacy pass manager's instcombine pass. +/// +/// This is a basic whole-function wrapper around the instcombine utility. It +/// will try to combine all instructions in the function. +class InstructionCombiningPass : public FunctionPass { + InstCombineWorklist Worklist; + +public: + static char ID; // Pass identification, replacement for typeid + + InstructionCombiningPass() : FunctionPass(ID) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; +} + +void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); +} + +bool InstructionCombiningPass::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + + // Required analyses. + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + // Optional analyses. + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - Builder = nullptr; - return EverMadeChange; + return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI); +} + +char InstructionCombiningPass::ID = 0; +INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", + "Combine redundant instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", + "Combine redundant instructions", false, false) + +// Initialization Routines +void llvm::initializeInstCombine(PassRegistry &Registry) { + initializeInstructionCombiningPassPass(Registry); +} + +void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { + initializeInstructionCombiningPassPass(*unwrap(R)); } FunctionPass *llvm::createInstructionCombiningPass() { - return new InstCombiner(); + return new InstructionCombiningPass(); } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 25f1f02..939e04b 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -24,6 +24,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" @@ -43,12 +46,14 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" #include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include 
"llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <string> #include <system_error> @@ -64,22 +69,21 @@ static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G. static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; -static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 36; +static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; +static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30; static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46; static const uint64_t kWindowsShadowOffset32 = 3ULL << 28; -static const size_t kMinStackMallocSize = 1 << 6; // 64B +static const size_t kMinStackMallocSize = 1 << 6; // 64B static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; static const char *const kAsanModuleCtorName = "asan.module_ctor"; static const char *const kAsanModuleDtorName = "asan.module_dtor"; -static const uint64_t kAsanCtorAndDtorPriority = 1; +static const uint64_t kAsanCtorAndDtorPriority = 1; static const char *const kAsanReportErrorTemplate = "__asan_report_"; -static const char *const kAsanReportLoadN = "__asan_report_load_n"; -static const char *const kAsanReportStoreN = "__asan_report_store_n"; static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *const kAsanUnregisterGlobalsName = "__asan_unregister_globals"; @@ -89,7 +93,7 @@ static const char *const kAsanInitName = "__asan_init_v5"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; -static const int kMaxAsanStackMallocSizeClass = 10; +static const int kMaxAsanStackMallocSizeClass = 10; static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_"; static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_"; static const char *const kAsanGenPrefix = "__asan_gen_"; @@ -102,10 +106,6 @@ static const char *const kAsanUnpoisonStackMemoryName = static const char *const kAsanOptionDetectUAR = "__asan_option_detect_stack_use_after_return"; -#ifndef NDEBUG -static const int kAsanStackAfterReturnMagic = 0xf5; -#endif - // Accesses sizes are powers of two: 1, 2, 4, 8, 16. static const size_t kNumberOfAccessSizes = 5; @@ -119,84 +119,110 @@ static const unsigned kAsanAllocaPartialVal2 = 0x000000cbU; // This flag may need to be replaced with -f[no-]asan-reads. 
static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", - cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); -static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes", - cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); -static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics", - cl::desc("instrument atomic instructions (rmw, cmpxchg)"), - cl::Hidden, cl::init(true)); -static cl::opt<bool> ClAlwaysSlowPath("asan-always-slow-path", - cl::desc("use instrumentation with slow path for all accesses"), - cl::Hidden, cl::init(false)); + cl::desc("instrument read instructions"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentWrites( + "asan-instrument-writes", cl::desc("instrument write instructions"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentAtomics( + "asan-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden, + cl::init(true)); +static cl::opt<bool> ClAlwaysSlowPath( + "asan-always-slow-path", + cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden, + cl::init(false)); // This flag limits the number of instructions to be instrumented // in any given BB. Normally, this should be set to unlimited (INT_MAX), // but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily // set it to 10000. -static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb", - cl::init(10000), - cl::desc("maximal number of instructions to instrument in any given BB"), - cl::Hidden); +static cl::opt<int> ClMaxInsnsToInstrumentPerBB( + "asan-max-ins-per-bb", cl::init(10000), + cl::desc("maximal number of instructions to instrument in any given BB"), + cl::Hidden); // This flag may need to be replaced with -f[no]asan-stack. -static cl::opt<bool> ClStack("asan-stack", - cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"), + cl::Hidden, cl::init(true)); static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", - cl::desc("Check return-after-free"), cl::Hidden, cl::init(true)); + cl::desc("Check return-after-free"), + cl::Hidden, cl::init(true)); // This flag may need to be replaced with -f[no]asan-globals.
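// Every ClFoo declaration in this file follows the same llvm::cl::opt pattern:
// a static global registers a command-line flag at load time and converts
// implicitly to its value once cl::ParseCommandLineOptions has run. A minimal
// hedged illustration; the flag name below is invented for the example and is
// not a real ASan option:
#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> ClExampleFlag(
    "asan-example-flag",
    llvm::cl::desc("illustrative flag, not a real ASan option"),
    llvm::cl::Hidden, llvm::cl::init(false));

static bool exampleFlagEnabled() {
  return ClExampleFlag; // Implicit conversion to the underlying bool.
}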
static cl::opt<bool> ClGlobals("asan-globals", - cl::desc("Handle global objects"), cl::Hidden, cl::init(true)); + cl::desc("Handle global objects"), cl::Hidden, + cl::init(true)); static cl::opt<bool> ClInitializers("asan-initialization-order", - cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(true)); -static cl::opt<bool> ClInvalidPointerPairs("asan-detect-invalid-pointer-pair", - cl::desc("Instrument <, <=, >, >=, - with pointer operands"), - cl::Hidden, cl::init(false)); -static cl::opt<unsigned> ClRealignStack("asan-realign-stack", - cl::desc("Realign stack to the value of this flag (power of two)"), - cl::Hidden, cl::init(32)); + cl::desc("Handle C++ initializer order"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInvalidPointerPairs( + "asan-detect-invalid-pointer-pair", + cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden, + cl::init(false)); +static cl::opt<unsigned> ClRealignStack( + "asan-realign-stack", + cl::desc("Realign stack to the value of this flag (power of two)"), + cl::Hidden, cl::init(32)); static cl::opt<int> ClInstrumentationWithCallsThreshold( "asan-instrumentation-with-call-threshold", - cl::desc("If the function being instrumented contains more than " - "this number of memory accesses, use callbacks instead of " - "inline checks (-1 means never use callbacks)."), - cl::Hidden, cl::init(7000)); + cl::desc( + "If the function being instrumented contains more than " + "this number of memory accesses, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(7000)); static cl::opt<std::string> ClMemoryAccessCallbackPrefix( - "asan-memory-access-callback-prefix", - cl::desc("Prefix for memory access callbacks"), cl::Hidden, - cl::init("__asan_")); + "asan-memory-access-callback-prefix", + cl::desc("Prefix for memory access callbacks"), cl::Hidden, + cl::init("__asan_")); static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas", - cl::desc("instrument dynamic allocas"), cl::Hidden, cl::init(false)); + cl::desc("instrument dynamic allocas"), + cl::Hidden, cl::init(false)); +static cl::opt<bool> ClSkipPromotableAllocas( + "asan-skip-promotable-allocas", + cl::desc("Do not instrument promotable allocas"), cl::Hidden, + cl::init(true)); // These flags allow changing the shadow mapping. // The shadow mapping looks like // Shadow = (Mem >> scale) + (1 << offset_log) static cl::opt<int> ClMappingScale("asan-mapping-scale", - cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0)); + cl::desc("scale of asan shadow mapping"), + cl::Hidden, cl::init(0)); // Optimization flags. Not user visible, used mostly for testing // and benchmarking the tool.
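// The shadow mapping documented just above, as plain arithmetic. With the
// defaults on Linux/x86-64 (Scale = 3, Offset = kSmallX86_64ShadowOffset =
// 0x7FFF8000), every 8-byte granule of application memory maps to one shadow
// byte. A minimal sketch; note that memToShadow below ORs the offset in
// instead of adding it on targets where Mapping.OrShadowOffset is set.
#include <cstdint>

uint64_t memToShadowExample(uint64_t Mem, int Scale, uint64_t Offset) {
  return (Mem >> Scale) + Offset; // Shadow = (Mem >> scale) + offset
}
// e.g. memToShadowExample(0x602000000010, 3, 0x7FFF8000)
//        == (0x602000000010 >> 3) + 0x7FFF8000 == 0xC047FFF8002.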
-static cl::opt<bool> ClOpt("asan-opt", - cl::desc("Optimize instrumentation"), cl::Hidden, cl::init(true)); -static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp", - cl::desc("Instrument the same temp just once"), cl::Hidden, - cl::init(true)); +static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptSameTemp( + "asan-opt-same-temp", cl::desc("Instrument the same temp just once"), + cl::Hidden, cl::init(true)); static cl::opt<bool> ClOptGlobals("asan-opt-globals", - cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true)); - -static cl::opt<bool> ClCheckLifetime("asan-check-lifetime", - cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), - cl::Hidden, cl::init(false)); + cl::desc("Don't instrument scalar globals"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptStack( + "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClCheckLifetime( + "asan-check-lifetime", + cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), cl::Hidden, + cl::init(false)); static cl::opt<bool> ClDynamicAllocaStack( "asan-stack-dynamic-alloca", cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden, - cl::init(false)); + cl::init(true)); + +static cl::opt<uint32_t> ClForceExperiment( + "asan-force-experiment", + cl::desc("Force optimization experiment (for testing)"), cl::Hidden, + cl::init(0)); // Debug flags. static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, cl::init(0)); static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"), cl::Hidden, cl::init(0)); -static cl::opt<std::string> ClDebugFunc("asan-debug-func", - cl::Hidden, cl::desc("Debug func")); +static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden, + cl::desc("Debug func")); static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), cl::Hidden, cl::init(-1)); static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"), @@ -206,10 +232,10 @@ STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); STATISTIC(NumInstrumentedDynamicAllocas, "Number of instrumented dynamic allocas"); -STATISTIC(NumOptimizedAccessesToGlobalArray, - "Number of optimized accesses to global arrays"); STATISTIC(NumOptimizedAccessesToGlobalVar, "Number of optimized accesses to global vars"); +STATISTIC(NumOptimizedAccessesToStackVar, + "Number of optimized accesses to stack vars"); namespace { /// Frontend-provided metadata for source location.
@@ -224,8 +250,8 @@ struct LocationMetadata { void parse(MDNode *MDN) { assert(MDN->getNumOperands() == 3); - MDString *MDFilename = cast<MDString>(MDN->getOperand(0)); - Filename = MDFilename->getString(); + MDString *DIFilename = cast<MDString>(MDN->getOperand(0)); + Filename = DIFilename->getString(); LineNo = mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue(); ColumnNo = @@ -237,9 +263,7 @@ struct LocationMetadata { class GlobalsMetadata { public: struct Entry { - Entry() - : SourceLoc(), Name(), IsDynInit(false), - IsBlacklisted(false) {} + Entry() : SourceLoc(), Name(), IsDynInit(false), IsBlacklisted(false) {} LocationMetadata SourceLoc; StringRef Name; bool IsDynInit; @@ -248,19 +272,17 @@ class GlobalsMetadata { GlobalsMetadata() : inited_(false) {} - void init(Module& M) { + void init(Module &M) { assert(!inited_); inited_ = true; NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals"); - if (!Globals) - return; + if (!Globals) return; for (auto MDN : Globals->operands()) { // Metadata node contains the global and the fields of "Entry". assert(MDN->getNumOperands() == 5); auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0)); // The optimizer may optimize away a global entirely. - if (!GV) - continue; + if (!GV) continue; // We can already have an entry for GV if it was merged with another // global. Entry &E = Entries[GV]; @@ -285,7 +307,7 @@ class GlobalsMetadata { private: bool inited_; - DenseMap<GlobalVariable*, Entry> Entries; + DenseMap<GlobalVariable *, Entry> Entries; }; /// This struct defines the shadow mapping using the rule: @@ -308,6 +330,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize) { TargetTriple.getArch() == llvm::Triple::mipsel; bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el; + bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64; bool IsWindows = TargetTriple.isOSWindows(); ShadowMapping Mapping; @@ -334,6 +357,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize) { Mapping.Offset = kSmallX86_64ShadowOffset; else if (IsMIPS64) Mapping.Offset = kMIPS64_ShadowOffset64; + else if (IsAArch64) + Mapping.Offset = kAArch64_ShadowOffset64; else Mapping.Offset = kDefaultShadowOffset64; } @@ -367,17 +392,36 @@ struct AddressSanitizer : public FunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + uint64_t getAllocaSizeInBytes(AllocaInst *AI) const { + Type *Ty = AI->getAllocatedType(); + uint64_t SizeInBytes = + AI->getModule()->getDataLayout().getTypeAllocSize(Ty); + return SizeInBytes; } - void instrumentMop(Instruction *I, bool UseCalls); + /// Check if we want (and can) handle this alloca. + bool isInterestingAlloca(AllocaInst &AI); + /// If it is an interesting memory access, return the PointerOperand + /// and set IsWrite/Alignment. Otherwise return nullptr. 
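// A condensed model of the per-target offset selection that getShadowMapping
// above performs, restricted to a few 64-bit cases and using the constants
// defined earlier in this file. `Platform` is an invented enum for the sketch,
// not an LLVM type; the real code dispatches on the Triple.
#include <cstdint>

enum class Platform { Linux_x86_64, FreeBSD_64, MIPS64, AArch64, Other64 };

uint64_t shadowOffset64(Platform P) {
  switch (P) {
  case Platform::Linux_x86_64: return 0x7FFF8000;  // kSmallX86_64ShadowOffset
  case Platform::FreeBSD_64:   return 1ULL << 46;  // kFreeBSD_ShadowOffset64
  case Platform::MIPS64:       return 1ULL << 37;  // kMIPS64_ShadowOffset64
  case Platform::AArch64:      return 1ULL << 36;  // kAArch64_ShadowOffset64
  case Platform::Other64:      return 1ULL << 44;  // kDefaultShadowOffset64
  }
  return 1ULL << 44;
}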
+ Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, + uint64_t *TypeSize, + unsigned *Alignment); + void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I, + bool UseCalls, const DataLayout &DL); void instrumentPointerComparisonOrSubtraction(Instruction *I); void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, - Value *SizeArgument, bool UseCalls); + Value *SizeArgument, bool UseCalls, uint32_t Exp); + void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp); Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize); Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, bool IsWrite, size_t AccessSizeIndex, - Value *SizeArgument); + Value *SizeArgument, uint32_t Exp); void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool runOnFunction(Function &F) override; @@ -392,9 +436,10 @@ struct AddressSanitizer : public FunctionPass { bool LooksLikeCodeInBug11395(Instruction *I); bool GlobalIsLinkerInitialized(GlobalVariable *G); + bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr, + uint64_t TypeSize) const; LLVMContext *C; - const DataLayout *DL; Triple TargetTriple; int LongSize; Type *IntptrTy; @@ -404,15 +449,16 @@ struct AddressSanitizer : public FunctionPass { Function *AsanInitFunction; Function *AsanHandleNoReturnFunc; Function *AsanPtrCmpFunction, *AsanPtrSubFunction; - // This array is indexed by AccessIsWrite and log2(AccessSize). - Function *AsanErrorCallback[2][kNumberOfAccessSizes]; - Function *AsanMemoryAccessCallback[2][kNumberOfAccessSizes]; - // This array is indexed by AccessIsWrite. - Function *AsanErrorCallbackSized[2], - *AsanMemoryAccessCallbackSized[2]; + // This array is indexed by AccessIsWrite, Experiment and log2(AccessSize). + Function *AsanErrorCallback[2][2][kNumberOfAccessSizes]; + Function *AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes]; + // This array is indexed by AccessIsWrite and Experiment. 
+ Function *AsanErrorCallbackSized[2][2]; + Function *AsanMemoryAccessCallbackSized[2][2]; Function *AsanMemmove, *AsanMemcpy, *AsanMemset; InlineAsm *EmptyAsm; GlobalsMetadata GlobalsMD; + DenseMap<AllocaInst *, bool> ProcessedAllocas; friend struct FunctionStackPoisoner; }; @@ -422,9 +468,7 @@ class AddressSanitizerModule : public ModulePass { AddressSanitizerModule() : ModulePass(ID) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid - const char *getPassName() const override { - return "AddressSanitizerModule"; - } + const char *getPassName() const override { return "AddressSanitizerModule"; } private: void initializeCallbacks(Module &M); @@ -440,7 +484,6 @@ class AddressSanitizerModule : public ModulePass { GlobalsMetadata GlobalsMD; Type *IntptrTy; LLVMContext *C; - const DataLayout *DL; Triple TargetTriple; ShadowMapping Mapping; Function *AsanPoisonGlobals; @@ -467,12 +510,12 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { Type *IntptrPtrTy; ShadowMapping Mapping; - SmallVector<AllocaInst*, 16> AllocaVec; - SmallVector<Instruction*, 8> RetVec; + SmallVector<AllocaInst *, 16> AllocaVec; + SmallVector<Instruction *, 8> RetVec; unsigned StackAlignment; Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1], - *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; + *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc; // Stores a place and arguments of poisoning/unpoisoning call for alloca. @@ -493,33 +536,38 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { Value *LeftRzAddr; Value *RightRzAddr; bool Poison; - explicit DynamicAllocaCall(AllocaInst *AI, - Value *LeftRzAddr = nullptr, - Value *RightRzAddr = nullptr) - : AI(AI), LeftRzAddr(LeftRzAddr), RightRzAddr(RightRzAddr), Poison(true) - {} + explicit DynamicAllocaCall(AllocaInst *AI, Value *LeftRzAddr = nullptr, + Value *RightRzAddr = nullptr) + : AI(AI), + LeftRzAddr(LeftRzAddr), + RightRzAddr(RightRzAddr), + Poison(true) {} }; SmallVector<DynamicAllocaCall, 1> DynamicAllocaVec; // Maps Value to an AllocaInst from which the Value is originated. - typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy; + typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy; AllocaForValueMapTy AllocaForValue; bool HasNonEmptyInlineAsm; std::unique_ptr<CallInst> EmptyInlineAsm; FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) - : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false), - C(ASan.C), IntptrTy(ASan.IntptrTy), - IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping), - StackAlignment(1 << Mapping.Scale), HasNonEmptyInlineAsm(false), + : F(F), + ASan(ASan), + DIB(*F.getParent(), /*AllowUnresolved*/ false), + C(ASan.C), + IntptrTy(ASan.IntptrTy), + IntptrPtrTy(PointerType::get(IntptrTy, 0)), + Mapping(ASan.Mapping), + StackAlignment(1 << Mapping.Scale), + HasNonEmptyInlineAsm(false), EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} bool runOnFunction() { if (!ClStack) return false; // Collect alloca, ret, lifetime instructions etc. - for (BasicBlock *BB : depth_first(&F.getEntryBlock())) - visit(*BB); + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false; @@ -540,33 +588,31 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { // ----------------------- Visitors. /// \brief Collect all Ret instructions. 
- void visitReturnInst(ReturnInst &RI) { - RetVec.push_back(&RI); - } + void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); } // Unpoison dynamic allocas redzones. void unpoisonDynamicAlloca(DynamicAllocaCall &AllocaCall) { - if (!AllocaCall.Poison) - return; + if (!AllocaCall.Poison) return; for (auto Ret : RetVec) { IRBuilder<> IRBRet(Ret); PointerType *Int32PtrTy = PointerType::getUnqual(IRBRet.getInt32Ty()); Value *Zero = Constant::getNullValue(IRBRet.getInt32Ty()); Value *PartialRzAddr = IRBRet.CreateSub(AllocaCall.RightRzAddr, ConstantInt::get(IntptrTy, 4)); - IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr, - Int32PtrTy)); - IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(PartialRzAddr, - Int32PtrTy)); - IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr, - Int32PtrTy)); + IRBRet.CreateStore( + Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr, Int32PtrTy)); + IRBRet.CreateStore(Zero, + IRBRet.CreateIntToPtr(PartialRzAddr, Int32PtrTy)); + IRBRet.CreateStore( + Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr, Int32PtrTy)); } } // Right shift for BigEndian and left shift for LittleEndian. Value *shiftAllocaMagic(Value *Val, IRBuilder<> &IRB, Value *Shift) { - return ASan.DL->isLittleEndian() ? IRB.CreateShl(Val, Shift) - : IRB.CreateLShr(Val, Shift); + auto &DL = F.getParent()->getDataLayout(); + return DL.isLittleEndian() ? IRB.CreateShl(Val, Shift) + : IRB.CreateLShr(Val, Shift); } // Compute PartialRzMagic for dynamic alloca call. Since we don't know the @@ -595,7 +641,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { /// \brief Collect Alloca instructions we want (and can) handle. void visitAllocaInst(AllocaInst &AI) { - if (!isInterestingAlloca(AI)) return; + if (!ASan.isInterestingAlloca(AI)) return; StackAlignment = std::max(StackAlignment, AI.getAlignment()); if (isDynamicAlloca(AI)) @@ -609,8 +655,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void visitIntrinsicInst(IntrinsicInst &II) { if (!ClCheckLifetime) return; Intrinsic::ID ID = II.getIntrinsicID(); - if (ID != Intrinsic::lifetime_start && - ID != Intrinsic::lifetime_end) + if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end) return; // Found lifetime intrinsic, add ASan instrumentation if necessary. ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0)); @@ -640,8 +685,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { bool doesDominateAllExits(const Instruction *I) const { for (auto Ret : RetVec) { - if (!ASan.getDominatorTree().dominates(I, Ret)) - return false; + if (!ASan.getDominatorTree().dominates(I, Ret)) return false; } return true; } @@ -649,19 +693,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { bool isDynamicAlloca(AllocaInst &AI) const { return AI.isArrayAllocation() || !AI.isStaticAlloca(); } - - // Check if we want (and can) handle this alloca. - bool isInterestingAlloca(AllocaInst &AI) const { - return (AI.getAllocatedType()->isSized() && - // alloca() may be called with 0 size, ignore it. - getAllocaSizeInBytes(&AI) > 0); - } - - uint64_t getAllocaSizeInBytes(AllocaInst *AI) const { - Type *Ty = AI->getAllocatedType(); - uint64_t SizeInBytes = ASan.DL->getTypeAllocSize(Ty); - return SizeInBytes; - } /// Finds alloca where the value comes from. 
AllocaInst *findAllocaForValue(Value *V); void poisonRedZones(ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB, @@ -679,21 +710,25 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { } // namespace char AddressSanitizer::ID = 0; -INITIALIZE_PASS_BEGIN(AddressSanitizer, "asan", - "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", - false, false) +INITIALIZE_PASS_BEGIN( + AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, + false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(AddressSanitizer, "asan", - "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", - false, false) +INITIALIZE_PASS_END( + AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, + false) FunctionPass *llvm::createAddressSanitizerFunctionPass() { return new AddressSanitizer(); } char AddressSanitizerModule::ID = 0; -INITIALIZE_PASS(AddressSanitizerModule, "asan-module", +INITIALIZE_PASS( + AddressSanitizerModule, "asan-module", "AddressSanitizer: detects use-after-free and out-of-bounds bugs." - "ModulePass", false, false) + "ModulePass", + false, false) ModulePass *llvm::createAddressSanitizerModulePass() { return new AddressSanitizerModule(); } @@ -705,16 +740,15 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { } // \brief Create a constant for Str so that we can pass it to the run-time lib. -static GlobalVariable *createPrivateGlobalForString( - Module &M, StringRef Str, bool AllowMerging) { +static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str, + bool AllowMerging) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); // We use private linkage for module-local strings. If they can be merged // with another one, we set the unnamed_addr attribute. GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true, GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix); - if (AllowMerging) - GV->setUnnamedAddr(true); + if (AllowMerging) GV->setUnnamedAddr(true); GV->setAlignment(1); // Strings may not be merged w/o setting align 1. return GV; } @@ -743,8 +777,7 @@ static bool GlobalWasGeneratedByAsan(GlobalVariable *G) { Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // Shadow >> scale Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); - if (Mapping.Offset == 0) - return Shadow; + if (Mapping.Offset == 0) return Shadow; // (Shadow >> scale) | offset if (Mapping.OrShadowOffset) return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); @@ -756,53 +789,84 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { IRBuilder<> IRB(MI); if (isa<MemTransferInst>(MI)) { - IRB.CreateCall3( + IRB.CreateCall( isa<MemMoveInst>(MI) ? 
AsanMemmove : AsanMemcpy, - IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), - IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), - IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)); + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); } else if (isa<MemSetInst>(MI)) { - IRB.CreateCall3( + IRB.CreateCall( AsanMemset, - IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), - IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), - IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)); + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); } MI->eraseFromParent(); } -// If I is an interesting memory access, return the PointerOperand -// and set IsWrite/Alignment. Otherwise return nullptr. -static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, - unsigned *Alignment) { +/// Check if we want (and can) handle this alloca. +bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) { + auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI); + + if (PreviouslySeenAllocaInfo != ProcessedAllocas.end()) + return PreviouslySeenAllocaInfo->getSecond(); + + bool IsInteresting = (AI.getAllocatedType()->isSized() && + // alloca() may be called with 0 size, ignore it. + getAllocaSizeInBytes(&AI) > 0 && + // We are only interested in allocas not promotable to registers. + // Promotable allocas are common under -O0. + (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI))); + + ProcessedAllocas[&AI] = IsInteresting; + return IsInteresting; +} + +/// If I is an interesting memory access, return the PointerOperand +/// and set IsWrite/Alignment. Otherwise return nullptr. +Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, + bool *IsWrite, + uint64_t *TypeSize, + unsigned *Alignment) { // Skip memory accesses inserted by another instrumentation. 
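// isInterestingAlloca() above memoizes its verdict in ProcessedAllocas so the
// potentially expensive promotability analysis runs at most once per alloca.
// The same caching shape in portable C++, with a hypothetical predicate
// standing in for the real checks (CachedPredicate is an invented name):
#include <unordered_map>
#include <utility>

template <typename Key, typename Predicate>
class CachedPredicate {
  Predicate Pred;
  std::unordered_map<Key, bool> Cache;

public:
  explicit CachedPredicate(Predicate P) : Pred(std::move(P)) {}

  bool operator()(const Key &K) {
    auto It = Cache.find(K);
    if (It != Cache.end())
      return It->second; // Previously computed verdict.
    bool Result = Pred(K);
    Cache[K] = Result;
    return Result;
  }
};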
- if (I->getMetadata("nosanitize")) - return nullptr; + if (I->getMetadata("nosanitize")) return nullptr; + + Value *PtrOperand = nullptr; + const DataLayout &DL = I->getModule()->getDataLayout(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (!ClInstrumentReads) return nullptr; *IsWrite = false; + *TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); *Alignment = LI->getAlignment(); - return LI->getPointerOperand(); - } - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + PtrOperand = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { if (!ClInstrumentWrites) return nullptr; *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); *Alignment = SI->getAlignment(); - return SI->getPointerOperand(); - } - if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + PtrOperand = SI->getPointerOperand(); + } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { if (!ClInstrumentAtomics) return nullptr; *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); *Alignment = 0; - return RMW->getPointerOperand(); - } - if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + PtrOperand = RMW->getPointerOperand(); + } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { if (!ClInstrumentAtomics) return nullptr; *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); *Alignment = 0; - return XCHG->getPointerOperand(); + PtrOperand = XCHG->getPointerOperand(); } - return nullptr; + + // Treat memory accesses to promotable allocas as non-interesting since they + // will not cause memory violations. This greatly speeds up the instrumented + // executable at -O0. + if (ClSkipPromotableAllocas) + if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand)) + return isInterestingAlloca(*AI) ? AI : nullptr; + + return PtrOperand; } static bool isPointerOperand(Value *V) { @@ -814,17 +878,15 @@ static bool isPointerOperand(Value *V) { // the frontend. static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) { if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) { - if (!Cmp->isRelational()) - return false; + if (!Cmp->isRelational()) return false; } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - if (BO->getOpcode() != Instruction::Sub) - return false; + if (BO->getOpcode() != Instruction::Sub) return false; } else { return false; } if (!isPointerOperand(I->getOperand(0)) || !isPointerOperand(I->getOperand(1))) - return false; + return false; return true; } @@ -835,8 +897,8 @@ bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit; } -void -AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) { +void AddressSanitizer::instrumentPointerComparisonOrSubtraction( + Instruction *I) { IRBuilder<> IRB(I); Function *F = isa<ICmpInst>(I) ? 
AsanPtrCmpFunction : AsanPtrSubFunction; Value *Param[2] = {I->getOperand(0), I->getOperand(1)}; @@ -844,41 +906,50 @@ AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) { if (Param[i]->getType()->isPointerTy()) Param[i] = IRB.CreatePointerCast(Param[i], IntptrTy); } - IRB.CreateCall2(F, Param[0], Param[1]); + IRB.CreateCall(F, Param); } -void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) { +void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, + Instruction *I, bool UseCalls, + const DataLayout &DL) { bool IsWrite = false; unsigned Alignment = 0; - Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &Alignment); + uint64_t TypeSize = 0; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment); assert(Addr); + + // Optimization experiments. + // The experiments can be used to evaluate potential optimizations that remove + // instrumentation (assess false negatives). Instead of completely removing + // some instrumentation, you set Exp to a non-zero value (mask of optimization + // experiments that want to remove instrumentation of this instruction). + // If Exp is non-zero, this pass will emit special calls into runtime + // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls + // make runtime terminate the program in a special way (with a different + // exit status). Then you run the new compiler on a buggy corpus, collect + // the special terminations (ideally, you don't see them at all -- no false + // negatives) and make the decision on the optimization. + uint32_t Exp = ClForceExperiment; + if (ClOpt && ClOptGlobals) { - if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) { - // If initialization order checking is disabled, a simple access to a - // dynamically initialized global is always valid. - if (!ClInitializers || GlobalIsLinkerInitialized(G)) { - NumOptimizedAccessesToGlobalVar++; - return; - } - } - ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr); - if (CE && CE->isGEPWithNoNotionalOverIndexing()) { - if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) { - if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) { - NumOptimizedAccessesToGlobalArray++; - return; - } - } + // If initialization order checking is disabled, a simple access to a + // dynamically initialized global is always valid. + GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); + if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) && + isSafeAccess(ObjSizeVis, Addr, TypeSize)) { + NumOptimizedAccessesToGlobalVar++; + return; } } - Type *OrigPtrTy = Addr->getType(); - Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); - - assert(OrigTy->isSized()); - uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy); - - assert((TypeSize % 8) == 0); + if (ClOpt && ClOptStack) { + // A direct inbounds access to a stack variable is always valid. + if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) && + isSafeAccess(ObjSizeVis, Addr, TypeSize)) { + NumOptimizedAccessesToStackVar++; + return; + } + } if (IsWrite) NumInstrumentedWrites++; @@ -891,65 +962,57 @@ void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) { if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || TypeSize == 128) && (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) - return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls); - // Instrument unusual size or unusual alignment. 
- // We can not do it with a single check, so we do 1-byte check for the first - // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able - // to report the actual access size. - IRBuilder<> IRB(I); - Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); - Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); - if (UseCalls) { - IRB.CreateCall2(AsanMemoryAccessCallbackSized[IsWrite], AddrLong, Size); - } else { - Value *LastByte = IRB.CreateIntToPtr( - IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), - OrigPtrTy); - instrumentAddress(I, I, Addr, 8, IsWrite, Size, false); - instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false); - } + return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls, + Exp); + instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr, + UseCalls, Exp); } -// Validate the result of Module::getOrInsertFunction called for an interface -// function of AddressSanitizer. If the instrumented module defines a function -// with the same name, their prototypes must match, otherwise -// getOrInsertFunction returns a bitcast. -static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { - if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast); - FuncOrBitcast->dump(); - report_fatal_error("trying to redefine an AddressSanitizer " - "interface function"); -} - -Instruction *AddressSanitizer::generateCrashCode( - Instruction *InsertBefore, Value *Addr, - bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument) { +Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore, + Value *Addr, bool IsWrite, + size_t AccessSizeIndex, + Value *SizeArgument, + uint32_t Exp) { IRBuilder<> IRB(InsertBefore); - CallInst *Call = SizeArgument - ? IRB.CreateCall2(AsanErrorCallbackSized[IsWrite], Addr, SizeArgument) - : IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr); + Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp); + CallInst *Call = nullptr; + if (SizeArgument) { + if (Exp == 0) + Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0], + {Addr, SizeArgument}); + else + Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1], + {Addr, SizeArgument, ExpVal}); + } else { + if (Exp == 0) + Call = + IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr); + else + Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex], + {Addr, ExpVal}); + } // We don't do Call->setDoesNotReturn() because the BB already has // UnreachableInst at the end. // This EmptyAsm is required to avoid callback merge. 
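// Why the empty inline asm mentioned above helps: the __asan_report* calls in
// different branches are otherwise identical, so later passes may tail-merge
// them and collapse distinct debug locations into one. Inline asm with side
// effects is opaque to the optimizer, keeping the calls distinct. A hedged
// sketch of building such a barrier, using the same InlineAsm::get construction
// this file performs in initializeCallbacks:
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"

// Insert an optimizer-opaque no-op at the current insertion point.
static void emitAntiMergeBarrier(llvm::IRBuilder<> &IRB) {
  using namespace llvm;
  FunctionType *FnTy =
      FunctionType::get(Type::getVoidTy(IRB.getContext()), /*isVarArg=*/false);
  InlineAsm *Barrier = InlineAsm::get(FnTy, /*AsmString=*/"",
                                      /*Constraints=*/"",
                                      /*hasSideEffects=*/true);
  IRB.CreateCall(Barrier, {});
}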
- IRB.CreateCall(EmptyAsm); + IRB.CreateCall(EmptyAsm, {}); return Call; } Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, - Value *ShadowValue, - uint32_t TypeSize) { + Value *ShadowValue, + uint32_t TypeSize) { size_t Granularity = 1 << Mapping.Scale; // Addr & (Granularity - 1) - Value *LastAccessedByte = IRB.CreateAnd( - AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + Value *LastAccessedByte = + IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); // (Addr & (Granularity - 1)) + size - 1 if (TypeSize / 8 > 1) LastAccessedByte = IRB.CreateAdd( LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); // (uint8_t) ((Addr & (Granularity-1)) + size - 1) - LastAccessedByte = IRB.CreateIntCast( - LastAccessedByte, ShadowValue->getType(), false); + LastAccessedByte = + IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false); // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue); } @@ -957,24 +1020,29 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, - Value *SizeArgument, bool UseCalls) { + Value *SizeArgument, bool UseCalls, + uint32_t Exp) { IRBuilder<> IRB(InsertBefore); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); if (UseCalls) { - IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][AccessSizeIndex], - AddrLong); + if (Exp == 0) + IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex], + AddrLong); + else + IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex], + {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)}); return; } - Type *ShadowTy = IntegerType::get( - *C, std::max(8U, TypeSize >> Mapping.Scale)); + Type *ShadowTy = + IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale)); Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); Value *ShadowPtr = memToShadow(AddrLong, IRB); Value *CmpVal = Constant::getNullValue(ShadowTy); - Value *ShadowValue = IRB.CreateLoad( - IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); + Value *ShadowValue = + IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); size_t Granularity = 1 << Mapping.Scale; @@ -983,10 +1051,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { // We use branch weights for the slow path check, to indicate that the slow // path is rarely taken. This seems to be the case for SPEC benchmarks. 
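// The slow-path comparison built by createSlowPathCmp above, as plain
// arithmetic. A nonzero shadow byte k (0 < k < 8) means only the first k bytes
// of the corresponding 8-byte granule are addressable, so an access of `size`
// bytes is bad exactly when its last byte's offset within the granule reaches
// k. Assumes Granularity == 8 (Mapping.Scale == 3); the signed cast mirrors
// CreateICmpSGE, so negative redzone magics always report.
#include <cstdint>

bool slowPathReports(uint64_t Addr, uint32_t SizeInBytes, int8_t ShadowValue) {
  uint64_t LastAccessedByte = (Addr & 7) + SizeInBytes - 1;
  return static_cast<int8_t>(LastAccessedByte) >= ShadowValue;
}
// slowPathReports(0x1000, 4, 6) == false  (bytes 0..3 of a granule with 6
//                                          addressable bytes: in bounds)
// slowPathReports(0x1004, 4, 6) == true   (bytes 4..7 reach byte 6: report)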
- TerminatorInst *CheckTerm = - SplitBlockAndInsertIfThen(Cmp, InsertBefore, false, - MDBuilder(*C).createBranchWeights(1, 100000)); - assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional()); + TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000)); + assert(cast<BranchInst>(CheckTerm)->isUnconditional()); BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); @@ -999,11 +1066,37 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true); } - Instruction *Crash = generateCrashCode( - CrashTerm, AddrLong, IsWrite, AccessSizeIndex, SizeArgument); + Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite, + AccessSizeIndex, SizeArgument, Exp); Crash->setDebugLoc(OrigIns->getDebugLoc()); } +// Instrument unusual size or unusual alignment. +// We can not do it with a single check, so we do 1-byte check for the first +// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able +// to report the actual access size. +void AddressSanitizer::instrumentUnusualSizeOrAlignment( + Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, uint32_t Exp) { + IRBuilder<> IRB(I); + Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + if (UseCalls) { + if (Exp == 0) + IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0], + {AddrLong, Size}); + else + IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1], + {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)}); + } else { + Value *LastByte = IRB.CreateIntToPtr( + IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), + Addr->getType()); + instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp); + } +} + void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName) { // Set up the arguments to our poison/unpoison functions. @@ -1025,12 +1118,11 @@ void AddressSanitizerModule::createInitializerPoisonCalls( ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); for (Use &OP : CA->operands()) { - if (isa<ConstantAggregateZero>(OP)) - continue; + if (isa<ConstantAggregateZero>(OP)) continue; ConstantStruct *CS = cast<ConstantStruct>(OP); // Must have a function or null ptr. - if (Function* F = dyn_cast<Function>(CS->getOperand(1))) { + if (Function *F = dyn_cast<Function>(CS->getOperand(1))) { if (F->getName() == kAsanModuleCtorName) continue; ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0)); // Don't instrument CTORs that will run before asan.module_ctor. @@ -1055,13 +1147,11 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { G->getLinkage() != GlobalVariable::PrivateLinkage && G->getLinkage() != GlobalVariable::InternalLinkage) return false; - if (G->hasComdat()) - return false; + if (G->hasComdat()) return false; // Two problems with thread-locals: // - The address of the main thread's copy can't be computed at link-time. // - Need to poison all copies, not just the main thread's one. - if (G->isThreadLocal()) - return false; + if (G->isThreadLocal()) return false; // For now, just ignore this Global if the alignment is large. 
if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false; @@ -1072,10 +1162,8 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { StringRef ParsedSegment, ParsedSection; unsigned TAA = 0, StubSize = 0; bool TAAParsed; - std::string ErrorCode = - MCSectionMachO::ParseSectionSpecifier(Section, ParsedSegment, - ParsedSection, TAA, TAAParsed, - StubSize); + std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier( + Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); if (!ErrorCode.empty()) { report_fatal_error("Invalid section specifier '" + ParsedSection + "': " + ErrorCode + "."); @@ -1128,20 +1216,19 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { void AddressSanitizerModule::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); // Declare our poisoning and unpoisoning functions. - AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( + AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr)); AsanPoisonGlobals->setLinkage(Function::ExternalLinkage); - AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( + AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr)); AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage); // Declare functions that register/unregister globals. - AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( - kAsanRegisterGlobalsName, IRB.getVoidTy(), - IntptrTy, IntptrTy, nullptr)); + AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); - AsanUnregisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( - kAsanUnregisterGlobalsName, - IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); + AsanUnregisterGlobals = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(), + IntptrTy, IntptrTy, nullptr)); AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); } @@ -1154,8 +1241,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { SmallVector<GlobalVariable *, 16> GlobalsToChange; for (auto &G : M.globals()) { - if (ShouldInstrumentGlobal(&G)) - GlobalsToChange.push_back(&G); + if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G); } size_t n = GlobalsToChange.size(); @@ -1180,8 +1266,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { // We shouldn't merge same module names, as this string serves as unique // module ID in runtime. GlobalVariable *ModuleName = createPrivateGlobalForString( - M, M.getModuleIdentifier(), /*AllowMerging*/false); + M, M.getModuleIdentifier(), /*AllowMerging*/ false); + auto &DL = M.getDataLayout(); for (size_t i = 0; i < n; i++) { static const uint64_t kMaxGlobalRedzone = 1 << 18; GlobalVariable *G = GlobalsToChange[i]; @@ -1195,32 +1282,30 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { PointerType *PtrTy = cast<PointerType>(G->getType()); Type *Ty = PtrTy->getElementType(); - uint64_t SizeInBytes = DL->getTypeAllocSize(Ty); + uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); uint64_t MinRZ = MinRedzoneSizeForGlobal(); // MinRZ <= RZ <= kMaxGlobalRedzone // and trying to make RZ to be ~ 1/4 of SizeInBytes. 
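// The global redzone sizing rule that follows, in isolation: aim for roughly a
// quarter of the object size, clamp to [MinRZ, kMaxGlobalRedzone], then pad so
// that object + redzone is MinRZ-aligned. Plain C++ restatement of the
// arithmetic below:
#include <algorithm>
#include <cstdint>

uint64_t redzoneSizeForGlobal(uint64_t SizeInBytes, uint64_t MinRZ,
                              uint64_t MaxRZ) {
  uint64_t RZ =
      std::max(MinRZ, std::min(MaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
  uint64_t RightRedzoneSize = RZ;
  if (SizeInBytes % MinRZ) // Round the total up to a multiple of MinRZ.
    RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
  return RightRedzoneSize;
}
// e.g. a 100-byte global with MinRZ = 32 and MaxRZ = 1 << 18:
//   RZ = max(32, min(262144, (100/32/4)*32)) = 32; padding adds 28 more, so
//   the right redzone is 60 bytes and 100 + 60 is 32-byte aligned.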
- uint64_t RZ = std::max(MinRZ, - std::min(kMaxGlobalRedzone, - (SizeInBytes / MinRZ / 4) * MinRZ)); + uint64_t RZ = std::max( + MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ)); uint64_t RightRedzoneSize = RZ; // Round up to MinRZ - if (SizeInBytes % MinRZ) - RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ); + if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ); assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr); - Constant *NewInitializer = ConstantStruct::get( - NewTy, G->getInitializer(), - Constant::getNullValue(RightRedZoneTy), nullptr); + Constant *NewInitializer = + ConstantStruct::get(NewTy, G->getInitializer(), + Constant::getNullValue(RightRedZoneTy), nullptr); // Create a new global variable with enough space for a redzone. GlobalValue::LinkageTypes Linkage = G->getLinkage(); if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage) Linkage = GlobalValue::InternalLinkage; - GlobalVariable *NewGlobal = new GlobalVariable( - M, NewTy, G->isConstant(), Linkage, - NewInitializer, "", G, G->getThreadLocalMode()); + GlobalVariable *NewGlobal = + new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer, + "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setAlignment(MinRZ); @@ -1229,7 +1314,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { Indices2[1] = IRB.getInt32(0); G->replaceAllUsesWith( - ConstantExpr::getGetElementPtr(NewGlobal, Indices2, true)); + ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true)); NewGlobal->takeName(G); G->eraseFromParent(); @@ -1249,8 +1334,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { ConstantExpr::getPointerCast(ModuleName, IntptrTy), ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, nullptr); - if (ClInitializers && MD.IsDynInit) - HasDynamicallyInitializedGlobals = true; + if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); } @@ -1263,20 +1347,20 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { // Create calls for poisoning before initializers run and unpoisoning after. if (HasDynamicallyInitializedGlobals) createInitializerPoisonCalls(M, ModuleName); - IRB.CreateCall2(AsanRegisterGlobals, - IRB.CreatePointerCast(AllGlobals, IntptrTy), - ConstantInt::get(IntptrTy, n)); + IRB.CreateCall(AsanRegisterGlobals, + {IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)}); // We also need to unregister globals at the end, e.g. when a shared library // gets closed. 
- Function *AsanDtorFunction = Function::Create( - FunctionType::get(Type::getVoidTy(*C), false), - GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); + Function *AsanDtorFunction = + Function::Create(FunctionType::get(Type::getVoidTy(*C), false), + GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); - IRB_Dtor.CreateCall2(AsanUnregisterGlobals, - IRB.CreatePointerCast(AllGlobals, IntptrTy), - ConstantInt::get(IntptrTy, n)); + IRB_Dtor.CreateCall(AsanUnregisterGlobals, + {IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)}); appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority); DEBUG(dbgs() << M); @@ -1284,12 +1368,8 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { } bool AddressSanitizerModule::runOnModule(Module &M) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) - return false; - DL = &DLP->getDataLayout(); C = &(M.getContext()); - int LongSize = DL->getPointerSizeInBits(); + int LongSize = M.getDataLayout().getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); TargetTriple = Triple(M.getTargetTriple()); Mapping = getShadowMapping(TargetTriple, LongSize); @@ -1301,8 +1381,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { assert(CtorFunc); IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); - if (ClGlobals) - Changed |= InstrumentGlobals(IRB, M); + if (ClGlobals) Changed |= InstrumentGlobals(IRB, M); return Changed; } @@ -1310,50 +1389,51 @@ bool AddressSanitizerModule::runOnModule(Module &M) { void AddressSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); // Create __asan_report* callbacks. - for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { - for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; - AccessSizeIndex++) { - // IsWrite and TypeSize are encoded in the function name. - std::string Suffix = - (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); - AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = - checkInterfaceFunction( - M.getOrInsertFunction(kAsanReportErrorTemplate + Suffix, - IRB.getVoidTy(), IntptrTy, nullptr)); - AsanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] = - checkInterfaceFunction( - M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + Suffix, - IRB.getVoidTy(), IntptrTy, nullptr)); + // IsWrite, TypeSize and Exp are encoded in the function name. + for (int Exp = 0; Exp < 2; Exp++) { + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + const std::string TypeStr = AccessIsWrite ? "store" : "load"; + const std::string ExpStr = Exp ? "exp_" : ""; + const Type *ExpType = Exp ? 
Type::getInt32Ty(*C) : nullptr; + AsanErrorCallbackSized[AccessIsWrite][Exp] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanReportErrorTemplate + ExpStr + TypeStr + "_n", + IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); + AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N", + IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex); + AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanReportErrorTemplate + ExpStr + Suffix, IRB.getVoidTy(), + IntptrTy, ExpType, nullptr)); + AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + ExpStr + Suffix, IRB.getVoidTy(), + IntptrTy, ExpType, nullptr)); + } } } - AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction( - kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); - AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction( - kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); - - AsanMemoryAccessCallbackSized[0] = checkInterfaceFunction( - M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "loadN", - IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); - AsanMemoryAccessCallbackSized[1] = checkInterfaceFunction( - M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "storeN", - IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); - - AsanMemmove = checkInterfaceFunction(M.getOrInsertFunction( + + AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr)); - AsanMemcpy = checkInterfaceFunction(M.getOrInsertFunction( + AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr)); - AsanMemset = checkInterfaceFunction(M.getOrInsertFunction( + AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, nullptr)); - AsanHandleNoReturnFunc = checkInterfaceFunction( + AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), nullptr)); - AsanPtrCmpFunction = checkInterfaceFunction(M.getOrInsertFunction( + AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); - AsanPtrSubFunction = checkInterfaceFunction(M.getOrInsertFunction( + AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); // We insert an empty inline asm after __asan_report* to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), @@ -1364,28 +1444,18 @@ void AddressSanitizer::initializeCallbacks(Module &M) { // virtual bool AddressSanitizer::doInitialization(Module &M) { // Initialize the private fields. No one has accessed them before. 
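// How an index triple (IsWrite, Exp, AccessSizeIndex) maps to a runtime
// callback name, matching the string construction in initializeCallbacks just
// above ("__asan_report_" + optional "exp_" + "load"/"store" + access size).
// A dependency-free restatement for illustration:
#include <string>

std::string asanReportName(bool IsWrite, bool Exp, size_t AccessSizeIndex) {
  std::string Name = "__asan_report_";
  if (Exp)
    Name += "exp_"; // Experimental variant taking an extra uint32 argument.
  Name += IsWrite ? "store" : "load";
  Name += std::to_string(1u << AccessSizeIndex); // 1, 2, 4, 8 or 16 bytes.
  return Name;
}
// asanReportName(true, false, 2) -> "__asan_report_store4"
// asanReportName(false, true, 0) -> "__asan_report_exp_load1"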
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) - report_fatal_error("data layout missing"); - DL = &DLP->getDataLayout(); GlobalsMD.init(M); C = &(M.getContext()); - LongSize = DL->getPointerSizeInBits(); + LongSize = M.getDataLayout().getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); TargetTriple = Triple(M.getTargetTriple()); - AsanCtorFunction = Function::Create( - FunctionType::get(Type::getVoidTy(*C), false), - GlobalValue::InternalLinkage, kAsanModuleCtorName, &M); - BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction); - // call __asan_init in the module ctor. - IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB)); - AsanInitFunction = checkInterfaceFunction( - M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), nullptr)); - AsanInitFunction->setLinkage(Function::ExternalLinkage); - IRB.CreateCall(AsanInitFunction); + std::tie(AsanCtorFunction, AsanInitFunction) = + createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName, kAsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}); Mapping = getShadowMapping(TargetTriple, LongSize); @@ -1403,7 +1473,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // instrumented functions. if (F.getName().find(" load]") != std::string::npos) { IRBuilder<> IRB(F.begin()->begin()); - IRB.CreateCall(AsanInitFunction); + IRB.CreateCall(AsanInitFunction, {}); return true; } return false; @@ -1420,22 +1490,21 @@ bool AddressSanitizer::runOnFunction(Function &F) { // If needed, insert __asan_init before checking for SanitizeAddress attr. maybeInsertAsanInitAtFunctionEntry(F); - if (!F.hasFnAttribute(Attribute::SanitizeAddress)) - return false; + if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return false; - if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) - return false; + if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; // We want to instrument every address only once per basic block (unless there // are calls between uses). - SmallSet<Value*, 16> TempsToInstrument; - SmallVector<Instruction*, 16> ToInstrument; - SmallVector<Instruction*, 8> NoReturnCalls; - SmallVector<BasicBlock*, 16> AllBlocks; - SmallVector<Instruction*, 16> PointerComparisonsOrSubtracts; + SmallSet<Value *, 16> TempsToInstrument; + SmallVector<Instruction *, 16> ToInstrument; + SmallVector<Instruction *, 8> NoReturnCalls; + SmallVector<BasicBlock *, 16> AllBlocks; + SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts; int NumAllocas = 0; bool IsWrite; unsigned Alignment; + uint64_t TypeSize; // Fill the set of memory operations to instrument. for (auto &BB : F) { @@ -1444,8 +1513,8 @@ bool AddressSanitizer::runOnFunction(Function &F) { int NumInsnsPerBB = 0; for (auto &Inst : BB) { if (LooksLikeCodeInBug11395(&Inst)) return false; - if (Value *Addr = - isInterestingMemoryAccess(&Inst, &IsWrite, &Alignment)) { + if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize, + &Alignment)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr).second) continue; // We've seen this temp in the current BB. @@ -1457,21 +1526,18 @@ bool AddressSanitizer::runOnFunction(Function &F) { } else if (isa<MemIntrinsic>(Inst)) { // ok, take it. } else { - if (isa<AllocaInst>(Inst)) - NumAllocas++; + if (isa<AllocaInst>(Inst)) NumAllocas++; CallSite CS(&Inst); if (CS) { // A call inside BB. 
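// [Editor's sketch, not part of the diff] The recurring migration in this
// merge, isolated: DataLayoutPass is gone, so passes stop calling
// getAnalysisIfAvailable<DataLayoutPass>() and stop failing when it is
// absent; the layout is now a property of the Module itself and is always
// available. Assumes LLVM headers of this vintage (trunk r238337).
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

unsigned pointerBits(Module &M) {
  const DataLayout &DL = M.getDataLayout(); // always present, no null check
  return DL.getPointerSizeInBits();
}

unsigned pointerBitsInPass(Function &F) {
  // From inside a FunctionPass the same information is one hop away.
  return F.getParent()->getDataLayout().getPointerSizeInBits();
}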
TempsToInstrument.clear(); - if (CS.doesNotReturn()) - NoReturnCalls.push_back(CS.getInstruction()); + if (CS.doesNotReturn()) NoReturnCalls.push_back(CS.getInstruction()); } continue; } ToInstrument.push_back(&Inst); NumInsnsPerBB++; - if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) - break; + if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break; } } @@ -1480,13 +1546,20 @@ bool AddressSanitizer::runOnFunction(Function &F) { ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold) UseCalls = true; + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const DataLayout &DL = F.getParent()->getDataLayout(); + ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), + /*RoundToAlign=*/true); + // Instrument. int NumInstrumented = 0; for (auto Inst : ToInstrument) { if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isInterestingMemoryAccess(Inst, &IsWrite, &Alignment)) - instrumentMop(Inst, UseCalls); + if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment)) + instrumentMop(ObjSizeVis, Inst, UseCalls, + F.getParent()->getDataLayout()); else instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); } @@ -1500,7 +1573,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37 for (auto CI : NoReturnCalls) { IRBuilder<> IRB(CI); - IRB.CreateCall(AsanHandleNoReturnFunc); + IRB.CreateCall(AsanHandleNoReturnFunc, {}); } for (auto Inst : PointerComparisonsOrSubtracts) { @@ -1531,24 +1604,24 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) { std::string Suffix = itostr(i); - AsanStackMallocFunc[i] = checkInterfaceFunction(M.getOrInsertFunction( - kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy, nullptr)); - AsanStackFreeFunc[i] = checkInterfaceFunction( + AsanStackMallocFunc[i] = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy, + IntptrTy, nullptr)); + AsanStackFreeFunc[i] = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); } - AsanPoisonStackMemoryFunc = checkInterfaceFunction( + AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); - AsanUnpoisonStackMemoryFunc = checkInterfaceFunction( + AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); } -void -FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes, - IRBuilder<> &IRB, Value *ShadowBase, - bool DoPoison) { +void FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes, + IRBuilder<> &IRB, Value *ShadowBase, + bool DoPoison) { size_t n = ShadowBytes.size(); size_t i = 0; // We need to (un)poison n bytes of stack shadow. 
Poison as many as we can @@ -1559,7 +1632,7 @@ FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes, for (; i + LargeStoreSizeInBytes - 1 < n; i += LargeStoreSizeInBytes) { uint64_t Val = 0; for (size_t j = 0; j < LargeStoreSizeInBytes; j++) { - if (ASan.DL->isLittleEndian()) + if (F.getParent()->getDataLayout().isLittleEndian()) Val |= (uint64_t)ShadowBytes[i + j] << (8 * j); else Val = (Val << 8) | ShadowBytes[i + j]; @@ -1578,9 +1651,8 @@ FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes, static int StackMallocSizeClass(uint64_t LocalStackSize) { assert(LocalStackSize <= kMaxStackMallocSize); uint64_t MaxSize = kMinStackMallocSize; - for (int i = 0; ; i++, MaxSize *= 2) - if (LocalStackSize <= MaxSize) - return i; + for (int i = 0;; i++, MaxSize *= 2) + if (LocalStackSize <= MaxSize) return i; llvm_unreachable("impossible LocalStackSize"); } @@ -1592,18 +1664,21 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) { void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined( IRBuilder<> &IRB, Value *ShadowBase, int Size) { assert(!(Size % 8)); - assert(kAsanStackAfterReturnMagic == 0xf5); + + // kAsanStackAfterReturnMagic is 0xf5. + const uint64_t kAsanStackAfterReturnMagic64 = 0xf5f5f5f5f5f5f5f5ULL; + for (int i = 0; i < Size; i += 8) { Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)); - IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL), - IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo())); + IRB.CreateStore( + ConstantInt::get(IRB.getInt64Ty(), kAsanStackAfterReturnMagic64), + IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo())); } } static DebugLoc getFunctionEntryDebugLocation(Function &F) { for (const auto &Inst : F.getEntryBlock()) - if (!isa<AllocaInst>(Inst)) - return Inst.getDebugLoc(); + if (!isa<AllocaInst>(Inst)) return Inst.getDebugLoc(); return DebugLoc(); } @@ -1640,10 +1715,13 @@ Value *FunctionStackPoisoner::createAllocaForLayout( void FunctionStackPoisoner::poisonStack() { assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); - if (ClInstrumentAllocas) + if (ClInstrumentAllocas) { // Handle dynamic allocas. - for (auto &AllocaCall : DynamicAllocaVec) + for (auto &AllocaCall : DynamicAllocaVec) { handleDynamicAllocaCall(AllocaCall); + unpoisonDynamicAlloca(AllocaCall); + } + } if (AllocaVec.size() == 0) return; @@ -1657,9 +1735,9 @@ void FunctionStackPoisoner::poisonStack() { SmallVector<ASanStackVariableDescription, 16> SVD; SVD.reserve(AllocaVec.size()); for (AllocaInst *AI : AllocaVec) { - ASanStackVariableDescription D = { AI->getName().data(), - getAllocaSizeInBytes(AI), - AI->getAlignment(), AI, 0}; + ASanStackVariableDescription D = {AI->getName().data(), + ASan.getAllocaSizeInBytes(AI), + AI->getAlignment(), AI, 0}; SVD.push_back(D); } // Minimal header size (left redzone) is 4 pointers, @@ -1671,9 +1749,11 @@ void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = L.FrameSize; bool DoStackMalloc = ClUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; - // Don't do dynamic alloca in presence of inline asm: too often it - // makes assumptions on which registers are available. + // Don't do dynamic alloca in presence of inline asm: too often it makes + // assumptions on which registers are available. Don't do stack malloc in the + // presence of inline asm on 32-bit platforms for the same reason. 
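// [Editor's sketch, not part of the diff] Plain-C++ reconstruction of the
// size-class search reformatted above: the fake-stack allocator buckets
// frames into power-of-two classes starting at kMinStackMallocSize and
// returns the first class whose capacity covers the frame. The constant
// values used here are assumptions for illustration only.
#include <cassert>
#include <cstdint>

static const uint64_t kMinStackMallocSize = 1ULL << 6;  // assumed: 64 bytes
static const uint64_t kMaxStackMallocSize = 1ULL << 16; // assumed: 64 KiB

int stackMallocSizeClass(uint64_t LocalStackSize) {
  assert(LocalStackSize <= kMaxStackMallocSize);
  uint64_t MaxSize = kMinStackMallocSize;
  for (int i = 0;; i++, MaxSize *= 2)
    if (LocalStackSize <= MaxSize) return i; // first class that fits
}
// stackMallocSizeClass(64) == 0, (65) == 1, (200) == 2, ...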
bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; + DoStackMalloc &= !HasNonEmptyInlineAsm || ASan.LongSize != 32; Value *StaticAlloca = DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); @@ -1739,7 +1819,7 @@ void FunctionStackPoisoner::poisonStack() { Value *NewAllocaPtr = IRB.CreateIntToPtr( IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)), AI->getType()); - replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB); + replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, /*Deref=*/true); AI->replaceAllUsesWith(NewAllocaPtr); } @@ -1750,19 +1830,19 @@ void FunctionStackPoisoner::poisonStack() { BasePlus0); // Write the frame description constant to redzone[1]. Value *BasePlus1 = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)), - IntptrPtrTy); + IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, ASan.LongSize / 8)), + IntptrPtrTy); GlobalVariable *StackDescriptionGlobal = createPrivateGlobalForString(*F.getParent(), L.DescriptionString, - /*AllowMerging*/true); - Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, - IntptrTy); + /*AllowMerging*/ true); + Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); IRB.CreateStore(Description, BasePlus1); // Write the PC to redzone[2]. Value *BasePlus2 = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, - 2 * ASan.LongSize/8)), - IntptrPtrTy); + IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)), + IntptrPtrTy); IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2); // Poison the stack redzones at the entry. @@ -1807,8 +1887,9 @@ void FunctionStackPoisoner::poisonStack() { IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy())); } else { // For larger frames call __asan_stack_free_*. - IRBPoison.CreateCall2(AsanStackFreeFunc[StackMallocIdx], FakeStack, - ConstantInt::get(IntptrTy, LocalStackSize)); + IRBPoison.CreateCall( + AsanStackFreeFunc[StackMallocIdx], + {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)}); } IRBuilder<> IRBElse(ElseTerm); @@ -1822,14 +1903,8 @@ void FunctionStackPoisoner::poisonStack() { } } - if (ClInstrumentAllocas) - // Unpoison dynamic allocas. - for (auto &AllocaCall : DynamicAllocaVec) - unpoisonDynamicAlloca(AllocaCall); - // We are done. Remove the old unused alloca instructions. - for (auto AI : AllocaVec) - AI->eraseFromParent(); + for (auto AI : AllocaVec) AI->eraseFromParent(); } void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, @@ -1837,9 +1912,9 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, // For now just insert the call to ASan runtime. Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy); Value *SizeArg = ConstantInt::get(IntptrTy, Size); - IRB.CreateCall2(DoPoison ? AsanPoisonStackMemoryFunc - : AsanUnpoisonStackMemoryFunc, - AddrArg, SizeArg); + IRB.CreateCall(DoPoison ? AsanPoisonStackMemoryFunc + : AsanUnpoisonStackMemoryFunc, + {AddrArg, SizeArg}); } // Handling llvm.lifetime intrinsics for a given %alloca: @@ -1854,12 +1929,11 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) // We're intested only in allocas we can handle. - return isInterestingAlloca(*AI) ? AI : nullptr; + return ASan.isInterestingAlloca(*AI) ? 
AI : nullptr; // See if we've already calculated (or started to calculate) alloca for a // given value. AllocaForValueMapTy::iterator I = AllocaForValue.find(V); - if (I != AllocaForValue.end()) - return I->second; + if (I != AllocaForValue.end()) return I->second; // Store 0 while we're calculating alloca for value V to avoid // infinite recursion if the value references itself. AllocaForValue[V] = nullptr; @@ -1867,8 +1941,7 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { if (CastInst *CI = dyn_cast<CastInst>(V)) Res = findAllocaForValue(CI->getOperand(0)); else if (PHINode *PN = dyn_cast<PHINode>(V)) { - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *IncValue = PN->getIncomingValue(i); + for (Value *IncValue : PN->incoming_values()) { // Allow self-referencing phi-nodes. if (IncValue == PN) continue; AllocaInst *IncValueAI = findAllocaForValue(IncValue); @@ -1878,8 +1951,7 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { Res = IncValueAI; } } - if (Res) - AllocaForValue[V] = Res; + if (Res) AllocaForValue[V] = Res; return Res; } @@ -1910,14 +1982,14 @@ Value *FunctionStackPoisoner::computePartialRzMagic(Value *PartialSize, Value *Shift = IRB.CreateAnd(PartialSize, IRB.getInt32(~7)); unsigned Val1Int = kAsanAllocaPartialVal1; unsigned Val2Int = kAsanAllocaPartialVal2; - if (!ASan.DL->isLittleEndian()) { + if (!F.getParent()->getDataLayout().isLittleEndian()) { Val1Int = sys::getSwappedBytes(Val1Int); Val2Int = sys::getSwappedBytes(Val2Int); } Value *Val1 = shiftAllocaMagic(IRB.getInt32(Val1Int), IRB, Shift); Value *PartialBits = IRB.CreateAnd(PartialSize, IRB.getInt32(7)); // For BigEndian get 0x000000YZ -> 0xYZ000000. - if (ASan.DL->isBigEndian()) + if (F.getParent()->getDataLayout().isBigEndian()) PartialBits = IRB.CreateShl(PartialBits, IRB.getInt32(24)); Value *Val2 = IRB.getInt32(Val2Int); Value *Cond = @@ -1951,7 +2023,8 @@ void FunctionStackPoisoner::handleDynamicAllocaCall( // redzones, and OldSize is number of allocated blocks with // ElementSize size, get allocated memory size in bytes by // OldSize * ElementSize. - unsigned ElementSize = ASan.DL->getTypeAllocSize(AI->getAllocatedType()); + unsigned ElementSize = + F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType()); Value *OldSize = IRB.CreateMul(AI->getArraySize(), ConstantInt::get(IntptrTy, ElementSize)); @@ -2019,3 +2092,20 @@ void FunctionStackPoisoner::handleDynamicAllocaCall( AI->eraseFromParent(); NumInstrumentedDynamicAllocas++; } + +// isSafeAccess returns true if Addr is always inbounds with respect to its +// base object. For example, it is a field access or an array access with +// constant inbounds index. +bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, + Value *Addr, uint64_t TypeSize) const { + SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr); + if (!ObjSizeVis.bothKnown(SizeOffset)) return false; + uint64_t Size = SizeOffset.first.getZExtValue(); + int64_t Offset = SizeOffset.second.getSExtValue(); + // Three checks are required to ensure safety: + // . Offset >= 0 (since the offset is given from the base ptr) + // . Size >= Offset (unsigned) + // . 
Size - Offset >= NeededSize (unsigned) + return Offset >= 0 && Size >= uint64_t(Offset) && + Size - uint64_t(Offset) >= TypeSize / 8; +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 9a5cea8..f685803 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" @@ -24,7 +25,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; #define DEBUG_TYPE "bounds-checking" @@ -49,12 +49,10 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DataLayoutPass>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } private: - const DataLayout *DL; const TargetLibraryInfo *TLI; ObjectSizeOffsetEvaluator *ObjSizeEval; BuilderTy *Builder; @@ -63,7 +61,7 @@ namespace { BasicBlock *getTrapBB(); void emitBranchToTrap(Value *Cmp = nullptr); - bool instrument(Value *Ptr, Value *Val); + bool instrument(Value *Ptr, Value *Val, const DataLayout &DL); }; } @@ -84,7 +82,7 @@ BasicBlock *BoundsChecking::getTrapBB() { Builder->SetInsertPoint(TrapBB); llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap); - CallInst *TrapCall = Builder->CreateCall(F); + CallInst *TrapCall = Builder->CreateCall(F, {}); TrapCall->setDoesNotReturn(); TrapCall->setDoesNotThrow(); TrapCall->setDebugLoc(Inst->getDebugLoc()); @@ -125,8 +123,9 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { /// result from the load or the value being stored. It is used to determine the /// size of memory block that is touched. /// Returns true if any change was made to the IR, false otherwise. 
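// [Editor's sketch, not part of the diff] The same three-condition guard
// appears twice on this page: in AddressSanitizer::isSafeAccess above and in
// BoundsChecking::instrument below. Plain C++ with the signed/unsigned
// handling spelled out: Size is the object size, Offset the access offset
// from the object base (signed), NeededSize the number of bytes touched.
#include <cstdint>

bool accessIsInBounds(uint64_t Size, int64_t Offset, uint64_t NeededSize) {
  return Offset >= 0 &&                         // offset measured from base
         Size >= uint64_t(Offset) &&            // base+offset inside object
         Size - uint64_t(Offset) >= NeededSize; // the whole access fits
}
// A 4-byte load at offset 12 of a 16-byte object: in bounds.
// An 8-byte load at offset 12 of the same object: out of bounds.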
-bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { - uint64_t NeededSize = DL->getTypeStoreSize(InstVal->getType()); +bool BoundsChecking::instrument(Value *Ptr, Value *InstVal, + const DataLayout &DL) { + uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType()); DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) << " bytes\n"); @@ -141,7 +140,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { Value *Offset = SizeOffset.second; ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); - Type *IntTy = DL->getIntPtrType(Ptr->getType()); + Type *IntTy = DL.getIntPtrType(Ptr->getType()); Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); // three checks are required to ensure safety: @@ -165,8 +164,8 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { } bool BoundsChecking::runOnFunction(Function &F) { - DL = &getAnalysis<DataLayoutPass>().getDataLayout(); - TLI = &getAnalysis<TargetLibraryInfo>(); + const DataLayout &DL = F.getParent()->getDataLayout(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TrapBB = nullptr; BuilderTy TheBuilder(F.getContext(), TargetFolder(DL)); @@ -192,13 +191,16 @@ bool BoundsChecking::runOnFunction(Function &F) { Builder->SetInsertPoint(Inst); if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - MadeChange |= instrument(LI->getPointerOperand(), LI); + MadeChange |= instrument(LI->getPointerOperand(), LI, DL); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand()); + MadeChange |= + instrument(SI->getPointerOperand(), SI->getValueOperand(), DL); } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { - MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand()); + MadeChange |= + instrument(AI->getPointerOperand(), AI->getCompareOperand(), DL); } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) { - MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand()); + MadeChange |= + instrument(AI->getPointerOperand(), AI->getValOperand(), DL); } else { llvm_unreachable("unknown Instruction type"); } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 8f24476..2de6e1a 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -83,14 +83,14 @@ static cl::opt<bool> ClPreserveAlignment( cl::desc("respect alignment requirements provided by input IR"), cl::Hidden, cl::init(false)); -// The ABI list file controls how shadow parameters are passed. The pass treats +// The ABI list files control how shadow parameters are passed. The pass treats // every function labelled "uninstrumented" in the ABI list file as conforming // to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains // additional annotations for those functions, a call to one of those functions // will produce a warning message, as the labelling behaviour of the function is // unknown. The other supported annotations are "functional" and "discard", // which are described below under DataFlowSanitizer::WrapperKind. 
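// [Editor's sketch, not part of the diff] What the cl::opt -> cl::list
// switch in the next hunk buys: a cl::opt<std::string> keeps only a single
// -dfsan-abilist value, while cl::list accumulates every occurrence, so
// several ABI list files can be merged into one SpecialCaseList. The option
// name "example-abilist" is hypothetical.
#include "llvm/Support/CommandLine.h"
#include <string>
#include <vector>
using namespace llvm;

static cl::list<std::string>
    ExampleABIListFiles("example-abilist",
                        cl::desc("May be passed multiple times"), cl::Hidden);

std::vector<std::string>
mergeABILists(const std::vector<std::string> &FromAPI) {
  std::vector<std::string> All(FromAPI); // files given programmatically
  All.insert(All.end(), ExampleABIListFiles.begin(),
             ExampleABIListFiles.end()); // plus all command-line occurrences
  return All;
}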
-static cl::opt<std::string> ClABIListFile( +static cl::list<std::string> ClABIListFiles( "dfsan-abilist", cl::desc("File listing native ABI functions and how the pass treats them"), cl::Hidden); @@ -141,7 +141,9 @@ class DFSanABIList { std::unique_ptr<SpecialCaseList> SCL; public: - DFSanABIList(std::unique_ptr<SpecialCaseList> SCL) : SCL(std::move(SCL)) {} + DFSanABIList() {} + + void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); } /// Returns whether either this function or its source file are listed in the /// given category. @@ -215,7 +217,6 @@ class DataFlowSanitizer : public ModulePass { WK_Custom }; - const DataLayout *DL; Module *Mod; LLVMContext *Ctx; IntegerType *ShadowTy; @@ -247,7 +248,7 @@ class DataFlowSanitizer : public ModulePass { DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; AttributeSet ReadOnlyNoneAttrs; - DenseMap<const Function *, DISubprogram> FunctionDIs; + DenseMap<const Function *, DISubprogram *> FunctionDIs; Value *getShadowAddress(Value *Addr, Instruction *Pos); bool isInstrumented(const Function *F); @@ -264,9 +265,9 @@ class DataFlowSanitizer : public ModulePass { Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); public: - DataFlowSanitizer(StringRef ABIListFile = StringRef(), - void *(*getArgTLS)() = nullptr, - void *(*getRetValTLS)() = nullptr); + DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles = std::vector<std::string>(), + void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr); static char ID; bool doInitialization(Module &M) override; bool runOnModule(Module &M) override; @@ -351,25 +352,26 @@ char DataFlowSanitizer::ID; INITIALIZE_PASS(DataFlowSanitizer, "dfsan", "DataFlowSanitizer: dynamic data flow analysis.", false, false) -ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile, - void *(*getArgTLS)(), - void *(*getRetValTLS)()) { - return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS); +ModulePass * +llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) { + return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS); } -DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile, - void *(*getArgTLS)(), - void *(*getRetValTLS)()) - : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), - ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? 
ClABIListFile - : ABIListFile)) { +DataFlowSanitizer::DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), + void *(*getRetValTLS)()) + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); + AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), + ClABIListFiles.end()); + ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); } FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { - llvm::SmallVector<Type *, 4> ArgTypes; - std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); - for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) - ArgTypes.push_back(ShadowTy); + llvm::SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); if (T->isVarArg()) ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); @@ -382,9 +384,8 @@ FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { assert(!T->isVarArg()); llvm::SmallVector<Type *, 4> ArgTypes; ArgTypes.push_back(T->getPointerTo()); - std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes)); - for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) - ArgTypes.push_back(ShadowTy); + ArgTypes.append(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) ArgTypes.push_back(ShadowPtrTy); @@ -420,16 +421,13 @@ bool DataFlowSanitizer::doInitialization(Module &M) { bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) - report_fatal_error("data layout missing"); - DL = &DLP->getDataLayout(); + const DataLayout &DL = M.getDataLayout(); Mod = &M; Ctx = &M.getContext(); ShadowTy = IntegerType::get(*Ctx, ShadowWidth); ShadowPtrTy = PointerType::getUnqual(ShadowTy); - IntptrTy = DL->getIntPtrType(*Ctx); + IntptrTy = DL.getIntPtrType(*Ctx); ZeroShadow = ConstantInt::getSigned(ShadowTy, 0); ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8); if (IsX86_64) @@ -528,9 +526,9 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, F->getParent()); NewF->copyAttributesFrom(F); NewF->removeAttributes( - AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NewFT->getReturnType(), - AttributeSet::ReturnIndex)); + AttributeSet::ReturnIndex, + AttributeSet::get(F->getContext(), AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType()))); BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); if (F->isVarArg()) { @@ -591,9 +589,6 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, } bool DataFlowSanitizer::runOnModule(Module &M) { - if (!DL) - return false; - if (ABIList.isIn(M, "skip")) return false; @@ -708,9 +703,9 @@ bool DataFlowSanitizer::runOnModule(Module &M) { Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M); NewF->copyAttributesFrom(&F); NewF->removeAttributes( - AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NewFT->getReturnType(), - AttributeSet::ReturnIndex)); + AttributeSet::ReturnIndex, + AttributeSet::get(NewF->getContext(), AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType()))); for (Function::arg_iterator FArg = F.arg_begin(), NewFArg = NewF->arg_begin(), FArgEnd = 
F.arg_end(); @@ -758,7 +753,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // Patch the pointer to LLVM function in debug info descriptor. auto DI = FunctionDIs.find(&F); if (DI != FunctionDIs.end()) - DI->second.replaceFunction(&F); + DI->second->replaceFunction(&F); UnwrappedFnMap[WrappedFnCst] = &F; *i = NewF; @@ -855,7 +850,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( Ne, Pos, /*Unreachable=*/false, ColdCallWeights)); IRBuilder<> ThenIRB(BI); - ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn); + ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {}); } } } @@ -870,7 +865,7 @@ Value *DFSanFunction::getArgTLSPtr() { return ArgTLSPtr = DFS.ArgTLS; IRBuilder<> IRB(F->getEntryBlock().begin()); - return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS); + return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {}); } Value *DFSanFunction::getRetvalTLS() { @@ -880,7 +875,7 @@ Value *DFSanFunction::getRetvalTLS() { return RetvalTLSPtr = DFS.RetvalTLS; IRBuilder<> IRB(F->getEntryBlock().begin()); - return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS); + return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {}); } Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) { @@ -977,7 +972,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { IRBuilder<> IRB(Pos); if (AvoidNewBlocks) { - CallInst *Call = IRB.CreateCall2(DFS.DFSanCheckedUnionFn, V1, V2); + CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2}); Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); Call->addAttribute(1, Attribute::ZExt); Call->addAttribute(2, Attribute::ZExt); @@ -990,7 +985,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT)); IRBuilder<> ThenIRB(BI); - CallInst *Call = ThenIRB.CreateCall2(DFS.DFSanUnionFn, V1, V2); + CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2}); Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); Call->addAttribute(1, Attribute::ZExt); Call->addAttribute(2, Attribute::ZExt); @@ -1054,7 +1049,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; SmallVector<Value *, 2> Objs; - GetUnderlyingObjects(Addr, Objs, DFS.DL); + GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout()); bool AllConstants = true; for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end(); i != e; ++i) { @@ -1080,8 +1075,8 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, } case 2: { IRBuilder<> IRB(Pos); - Value *ShadowAddr1 = - IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1)); + Value *ShadowAddr1 = IRB.CreateGEP(DFS.ShadowTy, ShadowAddr, + ConstantInt::get(DFS.IntptrTy, 1)); return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos); } @@ -1092,8 +1087,9 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, // shadow is non-equal. 
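// [Editor's sketch, not part of the diff] The IRBuilder change threaded
// through this whole merge: the fixed-arity CreateCall2/CreateCall3/
// CreateCall5 helpers are replaced by a single CreateCall taking an ArrayRef
// of arguments, normally written as a braced initializer list; zero-argument
// calls now spell the empty list explicitly.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

CallInst *emitBinaryRuntimeCall(IRBuilder<> &IRB, Value *Fn, Value *A,
                                Value *B) {
  // Old: IRB.CreateCall2(Fn, A, B);
  return IRB.CreateCall(Fn, {A, B});
}

CallInst *emitNullaryRuntimeCall(IRBuilder<> &IRB, Value *Fn) {
  // Old: IRB.CreateCall(Fn);
  return IRB.CreateCall(Fn, {});
}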
BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F); IRBuilder<> FallbackIRB(FallbackBB); - CallInst *FallbackCall = FallbackIRB.CreateCall2( - DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)); + CallInst *FallbackCall = FallbackIRB.CreateCall( + DFS.DFSanUnionLoadFn, + {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); // Compare each of the shadows stored in the loaded 64 bits to each other, @@ -1132,7 +1128,8 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); DT.addNewBlock(NextBB, LastBr->getParent()); IRBuilder<> NextIRB(NextBB); - WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1)); + WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr, + ConstantInt::get(DFS.IntptrTy, 1)); Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign); ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow); LastBr->setSuccessor(0, NextBB); @@ -1148,14 +1145,15 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, } IRBuilder<> IRB(Pos); - CallInst *FallbackCall = IRB.CreateCall2( - DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)); + CallInst *FallbackCall = IRB.CreateCall( + DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); return FallbackCall; } void DFSanVisitor::visitLoadInst(LoadInst &LI) { - uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType()); + auto &DL = LI.getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(LI.getType()); if (Size == 0) { DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow); return; @@ -1165,7 +1163,7 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) { if (ClPreserveAlignment) { Align = LI.getAlignment(); if (Align == 0) - Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType()); + Align = DL.getABITypeAlignment(LI.getType()); } else { Align = 1; } @@ -1217,7 +1215,8 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *ShadowVecAddr = IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy)); do { - Value *CurShadowVecAddr = IRB.CreateConstGEP1_32(ShadowVecAddr, Offset); + Value *CurShadowVecAddr = + IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset); IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign); Size -= ShadowVecSize; ++Offset; @@ -1225,7 +1224,8 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Offset *= ShadowVecSize; } while (Size > 0) { - Value *CurShadowAddr = IRB.CreateConstGEP1_32(ShadowAddr, Offset); + Value *CurShadowAddr = + IRB.CreateConstGEP1_32(DFS.ShadowTy, ShadowAddr, Offset); IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign); --Size; ++Offset; @@ -1233,8 +1233,8 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, } void DFSanVisitor::visitStoreInst(StoreInst &SI) { - uint64_t Size = - DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType()); + auto &DL = SI.getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType()); if (Size == 0) return; @@ -1242,7 +1242,7 @@ void DFSanVisitor::visitStoreInst(StoreInst &SI) { if (ClPreserveAlignment) { Align = SI.getAlignment(); if (Align == 0) - Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType()); + Align = 
DL.getABITypeAlignment(SI.getValueOperand()->getType()); } else { Align = 1; } @@ -1333,10 +1333,10 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) { void DFSanVisitor::visitMemSetInst(MemSetInst &I) { IRBuilder<> IRB(&I); Value *ValShadow = DFSF.getShadow(I.getValue()); - IRB.CreateCall3( - DFSF.DFS.DFSanSetLabelFn, ValShadow, - IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(*DFSF.DFS.Ctx)), - IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)); + IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn, + {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy( + *DFSF.DFS.Ctx)), + IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)}); } void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { @@ -1358,8 +1358,8 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx); DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr); SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr); - IRB.CreateCall5(I.getCalledValue(), DestShadow, SrcShadow, LenShadow, - AlignShadow, I.getVolatileCst()); + IRB.CreateCall(I.getCalledValue(), {DestShadow, SrcShadow, LenShadow, + AlignShadow, I.getVolatileCst()}); } void DFSanVisitor::visitReturnInst(ReturnInst &RI) { @@ -1473,17 +1473,17 @@ void DFSanVisitor::visitCallSite(CallSite CS) { Args.push_back(DFSF.getShadow(*i)); if (FT->isVarArg()) { - auto LabelVAAlloca = - new AllocaInst(ArrayType::get(DFSF.DFS.ShadowTy, - CS.arg_size() - FT->getNumParams()), - "labelva", DFSF.F->getEntryBlock().begin()); + auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy, + CS.arg_size() - FT->getNumParams()); + auto *LabelVAAlloca = new AllocaInst(LabelVATy, "labelva", + DFSF.F->getEntryBlock().begin()); for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) { - auto LabelVAPtr = IRB.CreateStructGEP(LabelVAAlloca, n); + auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr); } - Args.push_back(IRB.CreateStructGEP(LabelVAAlloca, 0)); + Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0)); } if (!FT->getReturnType()->isVoidTy()) { @@ -1532,7 +1532,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { Next = II->getNormalDest()->begin(); } else { BasicBlock *NewBB = - SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS); + SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); Next = NewBB->begin(); } } else { @@ -1569,10 +1569,11 @@ void DFSanVisitor::visitCallSite(CallSite CS) { ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); AllocaInst *VarArgShadow = new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); - Args.push_back(IRB.CreateConstGEP2_32(VarArgShadow, 0, 0)); + Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0)); for (unsigned n = 0; i != e; ++i, ++n) { - IRB.CreateStore(DFSF.getShadow(*i), - IRB.CreateConstGEP2_32(VarArgShadow, 0, n)); + IRB.CreateStore( + DFSF.getShadow(*i), + IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n)); Args.push_back(*i); } } @@ -1587,8 +1588,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { NewCS.setCallingConv(CS.getCallingConv()); NewCS.setAttributes(CS.getAttributes().removeAttributes( *DFSF.DFS.Ctx, AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType(), - AttributeSet::ReturnIndex))); + AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType()))); if (Next) { ExtractValueInst *ExVal = diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp 
b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 60b541f..9a3ed5c 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -57,6 +57,7 @@ GCOVOptions GCOVOptions::getDefault() { Options.UseCfgChecksum = false; Options.NoRedZone = false; Options.FunctionNamesInData = true; + Options.ExitBlockBeforeBody = DefaultExitBlockBeforeBody; if (DefaultGCOVVersion.size() != 4) { llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") + @@ -72,20 +73,10 @@ namespace { class GCOVProfiler : public ModulePass { public: static char ID; - GCOVProfiler() : ModulePass(ID), Options(GCOVOptions::getDefault()) { - init(); - } - GCOVProfiler(const GCOVOptions &Options) : ModulePass(ID), Options(Options){ + GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {} + GCOVProfiler(const GCOVOptions &Opts) : ModulePass(ID), Options(Opts) { assert((Options.EmitNotes || Options.EmitData) && "GCOVProfiler asked to do nothing?"); - init(); - } - const char *getPassName() const override { - return "GCOV Profiler"; - } - - private: - void init() { ReversedVersion[0] = Options.Version[3]; ReversedVersion[1] = Options.Version[2]; ReversedVersion[2] = Options.Version[1]; @@ -93,6 +84,11 @@ namespace { ReversedVersion[4] = '\0'; initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } + const char *getPassName() const override { + return "GCOV Profiler"; + } + + private: bool runOnModule(Module &M) override; // Create the .gcno files for the Module based on DebugInfo. @@ -130,7 +126,7 @@ namespace { Function *insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); void insertIndirectCounterIncrement(); - std::string mangleName(DICompileUnit CU, const char *NewStem); + std::string mangleName(const DICompileUnit *CU, const char *NewStem); GCOVOptions Options; @@ -153,10 +149,10 @@ ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) { return new GCOVProfiler(Options); } -static StringRef getFunctionName(DISubprogram SP) { - if (!SP.getLinkageName().empty()) - return SP.getLinkageName(); - return SP.getName(); +static StringRef getFunctionName(const DISubprogram *SP) { + if (!SP->getLinkageName().empty()) + return SP->getLinkageName(); + return SP->getName(); } namespace { @@ -167,7 +163,7 @@ namespace { static const char *const BlockTag; static const char *const EdgeTag; - GCOVRecord() {} + GCOVRecord() = default; void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); @@ -313,13 +309,13 @@ namespace { // object users can construct, the blocks and lines will be rooted here. 
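// [Editor's sketch, not part of the diff] The debug-info change visible
// throughout this file: DISubprogram used to be a by-value wrapper around an
// MDNode queried with dot syntax (SP.getLinkageName(), SP.getLineNumber());
// after the metadata-hierarchy rewrite the code holds a const DISubprogram *
// and queries the specialized node directly. Assumes LLVM headers of this
// vintage.
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

StringRef preferredName(const DISubprogram *SP) {
  if (!SP->getLinkageName().empty()) // mangled name, if one was recorded
    return SP->getLinkageName();
  return SP->getName();
}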
class GCOVFunction : public GCOVRecord { public: - GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident, + GCOVFunction(const DISubprogram *SP, raw_ostream *os, uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody) : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), ReturnBlock(1, os) { this->os = os; - Function *F = SP.getFunction(); + Function *F = SP->getFunction(); DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); uint32_t i = 0; @@ -334,7 +330,7 @@ namespace { std::string FunctionNameAndLine; raw_string_ostream FNLOS(FunctionNameAndLine); - FNLOS << getFunctionName(SP) << SP.getLineNumber(); + FNLOS << getFunctionName(SP) << SP->getLine(); FNLOS.flush(); FuncChecksum = hash_value(FunctionNameAndLine); } @@ -370,7 +366,7 @@ namespace { void writeOut() { writeBytes(FunctionTag, 4); uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) + - 1 + lengthOfGCOVString(SP.getFilename()) + 1; + 1 + lengthOfGCOVString(SP->getFilename()) + 1; if (UseCfgChecksum) ++BlockLen; write(BlockLen); @@ -379,8 +375,8 @@ namespace { if (UseCfgChecksum) write(CfgChecksum); writeGCOVString(getFunctionName(SP)); - writeGCOVString(SP.getFilename()); - write(SP.getLineNumber()); + writeGCOVString(SP->getFilename()); + write(SP->getLine()); // Emit count of blocks. writeBytes(BlockTag, 4); @@ -415,7 +411,7 @@ namespace { } private: - DISubprogram SP; + const DISubprogram *SP; uint32_t Ident; uint32_t FuncChecksum; bool UseCfgChecksum; @@ -425,7 +421,8 @@ namespace { }; } -std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) { +std::string GCOVProfiler::mangleName(const DICompileUnit *CU, + const char *NewStem) { if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) { for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) { MDNode *N = GCov->getOperand(i); @@ -441,12 +438,12 @@ std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) { } } - SmallString<128> Filename = CU.getFilename(); + SmallString<128> Filename = CU->getFilename(); sys::path::replace_extension(Filename, NewStem); StringRef FName = sys::path::filename(Filename); SmallString<128> CurPath; if (sys::fs::current_path(CurPath)) return FName; - sys::path::append(CurPath, FName.str()); + sys::path::append(CurPath, FName); return CurPath.str(); } @@ -470,7 +467,8 @@ static bool functionHasLines(Function *F) { if (isa<DbgInfoIntrinsic>(I)) continue; const DebugLoc &Loc = I->getDebugLoc(); - if (Loc.isUnknown()) continue; + if (!Loc) + continue; // Artificial lines such as calls to the global constructors. if (Loc.getLine() == 0) continue; @@ -490,21 +488,14 @@ void GCOVProfiler::emitProfileNotes() { // this pass over the original .o's as they're produced, or run it after // LTO, we'll generate the same .gcno files. 
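// [Editor's sketch, not part of the diff] DebugLoc now wraps a nullable
// location node and converts to bool, so the old Loc.isUnknown() test
// becomes a plain truthiness check, as in functionHasLines() above.
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

bool hasRealLine(const Instruction &I) {
  const DebugLoc &Loc = I.getDebugLoc();
  if (!Loc)                  // old spelling: if (Loc.isUnknown()) ...
    return false;
  return Loc.getLine() != 0; // line 0 marks compiler-generated code
}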
- DICompileUnit CU(CU_Nodes->getOperand(i)); + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); std::error_code EC; raw_fd_ostream out(mangleName(CU, "gcno"), EC, sys::fs::F_None); std::string EdgeDestinations; - DIArray SPs = CU.getSubprograms(); unsigned FunctionIdent = 0; - for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { - DISubprogram SP(SPs.getElement(i)); - assert((!SP || SP.isSubprogram()) && - "A MDNode in subprograms of a CU should be null or a DISubprogram."); - if (!SP) - continue; - - Function *F = SP.getFunction(); + for (auto *SP : CU->getSubprograms()) { + Function *F = SP->getFunction(); if (!F) continue; if (!functionHasLines(F)) continue; @@ -518,7 +509,7 @@ void GCOVProfiler::emitProfileNotes() { Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++, Options.UseCfgChecksum, - DefaultExitBlockBeforeBody)); + Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { @@ -540,16 +531,18 @@ void GCOVProfiler::emitProfileNotes() { if (isa<DbgInfoIntrinsic>(I)) continue; const DebugLoc &Loc = I->getDebugLoc(); - if (Loc.isUnknown()) continue; + if (!Loc) + continue; // Artificial lines such as calls to the global constructors. if (Loc.getLine() == 0) continue; if (Line == Loc.getLine()) continue; Line = Loc.getLine(); - if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue; + if (SP != getDISubprogram(Loc.getScope())) + continue; - GCOVLines &Lines = Block.getFile(SP.getFilename()); + GCOVLines &Lines = Block.getFile(SP->getFilename()); Lines.addLine(Loc.getLine()); } } @@ -578,16 +571,10 @@ bool GCOVProfiler::emitProfileArcs() { bool Result = false; bool InsertIndCounterIncrCode = false; for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { - DICompileUnit CU(CU_Nodes->getOperand(i)); - DIArray SPs = CU.getSubprograms(); + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; - for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { - DISubprogram SP(SPs.getElement(i)); - assert((!SP || SP.isSubprogram()) && - "A MDNode in subprograms of a CU should be null or a DISubprogram."); - if (!SP) - continue; - Function *F = SP.getFunction(); + for (auto *SP : CU->getSubprograms()) { + Function *F = SP->getFunction(); if (!F) continue; if (!functionHasLines(F)) continue; if (!Result) Result = true; @@ -607,7 +594,7 @@ bool GCOVProfiler::emitProfileArcs() { GlobalValue::InternalLinkage, Constant::getNullValue(CounterTy), "__llvm_gcov_ctr"); - CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); + CountersBySP.push_back(std::make_pair(Counters, SP)); UniqueVector<BasicBlock *> ComplexEdgePreds; UniqueVector<BasicBlock *> ComplexEdgeSuccs; @@ -632,7 +619,8 @@ bool GCOVProfiler::emitProfileArcs() { SmallVector<Value *, 2> Idx; Idx.push_back(Builder.getInt64(0)); Idx.push_back(Sel); - Value *Counter = Builder.CreateInBoundsGEP(Counters, Idx); + Value *Counter = Builder.CreateInBoundsGEP(Counters->getValueType(), + Counters, Idx); Value *Count = Builder.CreateLoad(Counter); Count = Builder.CreateAdd(Count, Builder.getInt64(1)); Builder.CreateStore(Count, Counter); @@ -666,8 +654,8 @@ bool GCOVProfiler::emitProfileArcs() { // Build code to increment the counter. 
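// [Editor's sketch, not part of the diff] The GEP builders used above now
// take the pointee element type as an explicit leading argument instead of
// deriving it from the pointer operand — early groundwork for opaque
// pointers. Assumes LLVM headers of this vintage.
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *counterSlot(IRBuilder<> &Builder, GlobalVariable *Counters,
                   Value *Sel) {
  Value *Idx[] = {Builder.getInt64(0), Sel};
  // Old: Builder.CreateInBoundsGEP(Counters, Idx);
  return Builder.CreateInBoundsGEP(Counters->getValueType(), Counters, Idx);
}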
InsertIndCounterIncrCode = true; - Builder.CreateCall2(getIncrementIndirectCounterFunc(), - EdgeState, CounterPtrArray); + Builder.CreateCall(getIncrementIndirectCounterFunc(), + {EdgeState, CounterPtrArray}); } } } @@ -700,7 +688,7 @@ bool GCOVProfiler::emitProfileArcs() { // Initialize the environment and register the local writeout and flush // functions. Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy); - Builder.CreateCall2(GCOVInit, WriteoutF, FlushF); + Builder.CreateCall(GCOVInit, {WriteoutF, FlushF}); Builder.CreateRetVoid(); appendToGlobalCtors(*M, F, 0); @@ -859,34 +847,34 @@ Function *GCOVProfiler::insertCounterWriteout( NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (CU_Nodes) { for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { - DICompileUnit CU(CU_Nodes->getOperand(i)); + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); std::string FilenameGcda = mangleName(CU, "gcda"); uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i]; - Builder.CreateCall3(StartFile, - Builder.CreateGlobalStringPtr(FilenameGcda), + Builder.CreateCall(StartFile, + {Builder.CreateGlobalStringPtr(FilenameGcda), Builder.CreateGlobalStringPtr(ReversedVersion), - Builder.getInt32(CfgChecksum)); + Builder.getInt32(CfgChecksum)}); for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) { - DISubprogram SP(CountersBySP[j].second); + auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second); uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum(); - Builder.CreateCall5( - EmitFunction, Builder.getInt32(j), - Options.FunctionNamesInData ? - Builder.CreateGlobalStringPtr(getFunctionName(SP)) : - Constant::getNullValue(Builder.getInt8PtrTy()), - Builder.getInt32(FuncChecksum), - Builder.getInt8(Options.UseCfgChecksum), - Builder.getInt32(CfgChecksum)); + Builder.CreateCall( + EmitFunction, + {Builder.getInt32(j), + Options.FunctionNamesInData + ? Builder.CreateGlobalStringPtr(getFunctionName(SP)) + : Constant::getNullValue(Builder.getInt8PtrTy()), + Builder.getInt32(FuncChecksum), + Builder.getInt8(Options.UseCfgChecksum), + Builder.getInt32(CfgChecksum)}); GlobalVariable *GV = CountersBySP[j].first; unsigned Arcs = cast<ArrayType>(GV->getType()->getElementType())->getNumElements(); - Builder.CreateCall2(EmitArcs, - Builder.getInt32(Arcs), - Builder.CreateConstGEP2_64(GV, 0, 0)); + Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs), + Builder.CreateConstGEP2_64(GV, 0, 0)}); } - Builder.CreateCall(SummaryInfo); - Builder.CreateCall(EndFile); + Builder.CreateCall(SummaryInfo, {}); + Builder.CreateCall(EndFile, {}); } } @@ -926,7 +914,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); Arg = std::next(Fn->arg_begin()); Arg->setName("counters"); - Value *GEP = Builder.CreateGEP(Arg, ZExtPred); + Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred); Value *Counter = Builder.CreateLoad(GEP, "counter"); Cond = Builder.CreateICmpEQ(Counter, Constant::getNullValue( @@ -966,7 +954,7 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) { assert(WriteoutF && "Need to create the writeout function first!"); IRBuilder<> Builder(Entry); - Builder.CreateCall(WriteoutF); + Builder.CreateCall(WriteoutF, {}); // Zero out the counters. 
for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 2a3d154..610ff52 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -97,7 +97,8 @@ private: /// Add uses of our data variables and runtime hook. void emitUses(); - /// Create a static initializer for our data, on platforms that need it. + /// Create a static initializer for our data, on platforms that need it, + /// and for any profile output file that was specified. void emitInitialization(); }; @@ -202,6 +203,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); LLVMContext &Ctx = M->getContext(); ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); + Function *Fn = Inc->getParent()->getParent(); // Create the counters variable. auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), @@ -210,6 +212,10 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { Counters->setVisibility(Name->getVisibility()); Counters->setSection(getCountersSection()); Counters->setAlignment(8); + // Place the counters in the same comdat section as its parent function. + // Otherwise, we may get multiple counters for the same function in certain + // cases. + Counters->setComdat(Fn->getComdat()); RegionCounters[Inc->getName()] = Counters; @@ -234,6 +240,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { Data->setVisibility(Name->getVisibility()); Data->setSection(getDataSection()); Data->setAlignment(8); + Data->setComdat(Fn->getComdat()); // Mark the data variable as used so that it isn't stripped out. UsedVars.push_back(Data); @@ -288,6 +295,7 @@ void InstrProfiling::emitRuntimeHook() { User->addFnAttr(Attribute::NoInline); if (Options.NoRedZone) User->addFnAttr(Attribute::NoRedZone); + User->setVisibility(GlobalValue::HiddenVisibility); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); auto *Load = IRB.CreateLoad(Var); @@ -327,8 +335,10 @@ void InstrProfiling::emitUses() { } void InstrProfiling::emitInitialization() { + std::string InstrProfileOutput = Options.InstrProfileOutput; + Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); - if (!RegisterF) + if (!RegisterF && InstrProfileOutput.empty()) return; // Create the initialization function. @@ -343,7 +353,24 @@ void InstrProfiling::emitInitialization() { // Add the basic block and the necessary calls. 
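// [Editor's sketch, not part of the diff] The comdat placement added in the
// InstrProfiling hunk above, isolated: putting a function's counter and data
// variables into the function's own comdat lets the linker keep exactly one
// copy of the counters per kept function body, avoiding duplicates for
// inline and linkonce_odr functions.
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
using namespace llvm;

void attachToFunctionComdat(Function *Fn, GlobalVariable *Counters,
                            GlobalVariable *Data) {
  // Fn->getComdat() may be null; setComdat(nullptr) then leaves the globals
  // outside any comdat, matching the old behaviour.
  Counters->setComdat(Fn->getComdat());
  Data->setComdat(Fn->getComdat());
}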
IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); - IRB.CreateCall(RegisterF); + if (RegisterF) + IRB.CreateCall(RegisterF, {}); + if (!InstrProfileOutput.empty()) { + auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); + auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false); + auto *SetNameF = + Function::Create(SetNameTy, GlobalValue::ExternalLinkage, + "__llvm_profile_override_default_filename", M); + + // Create variable for profile name + Constant *ProfileNameConst = + ConstantDataArray::getString(M->getContext(), InstrProfileOutput, true); + GlobalVariable *ProfileName = + new GlobalVariable(*M, ProfileNameConst->getType(), true, + GlobalValue::PrivateLinkage, ProfileNameConst); + + IRB.CreateCall(SetNameF, IRB.CreatePointerCast(ProfileName, Int8PtrTy)); + } IRB.CreateRetVoid(); appendToGlobalCtors(*M, F, 0); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index d7d752f..100824e 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -120,6 +120,7 @@ using namespace llvm; #define DEBUG_TYPE "msan" +static const unsigned kOriginSize = 4; static const unsigned kMinOriginAlignment = 4; static const unsigned kShadowTLSAlignment = 8; @@ -190,6 +191,9 @@ static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow", cl::desc("Insert checks for constant shadow values"), cl::Hidden, cl::init(false)); +static const char *const kMsanModuleCtorName = "msan.module_ctor"; +static const char *const kMsanInitName = "__msan_init"; + namespace { // Memory map parameters used in application-to-shadow address calculation. @@ -209,7 +213,7 @@ struct PlatformMemoryMapParams { }; // i386 Linux -static const MemoryMapParams LinuxMemoryMapParams32 = { +static const MemoryMapParams Linux_I386_MemoryMapParams = { 0x000080000000, // AndMask 0, // XorMask (not used) 0, // ShadowBase (not used) @@ -217,15 +221,23 @@ static const MemoryMapParams LinuxMemoryMapParams32 = { }; // x86_64 Linux -static const MemoryMapParams LinuxMemoryMapParams64 = { +static const MemoryMapParams Linux_X86_64_MemoryMapParams = { 0x400000000000, // AndMask 0, // XorMask (not used) 0, // ShadowBase (not used) 0x200000000000, // OriginBase }; +// mips64 Linux +static const MemoryMapParams Linux_MIPS64_MemoryMapParams = { + 0x004000000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x002000000000, // OriginBase +}; + // i386 FreeBSD -static const MemoryMapParams FreeBSDMemoryMapParams32 = { +static const MemoryMapParams FreeBSD_I386_MemoryMapParams = { 0x000180000000, // AndMask 0x000040000000, // XorMask 0x000020000000, // ShadowBase @@ -233,21 +245,26 @@ static const MemoryMapParams FreeBSDMemoryMapParams32 = { }; // x86_64 FreeBSD -static const MemoryMapParams FreeBSDMemoryMapParams64 = { +static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = { 0xc00000000000, // AndMask 0x200000000000, // XorMask 0x100000000000, // ShadowBase 0x380000000000, // OriginBase }; -static const PlatformMemoryMapParams LinuxMemoryMapParams = { - &LinuxMemoryMapParams32, - &LinuxMemoryMapParams64, +static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = { + &Linux_I386_MemoryMapParams, + &Linux_X86_64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { + NULL, + &Linux_MIPS64_MemoryMapParams, }; -static const PlatformMemoryMapParams FreeBSDMemoryMapParams = { 
- &FreeBSDMemoryMapParams32, - &FreeBSDMemoryMapParams64, +static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { + &FreeBSD_I386_MemoryMapParams, + &FreeBSD_X86_64_MemoryMapParams, }; /// \brief An instrumentation pass implementing detection of uninitialized @@ -260,7 +277,6 @@ class MemorySanitizer : public FunctionPass { MemorySanitizer(int TrackOrigins = 0) : FunctionPass(ID), TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)), - DL(nullptr), WarningFn(nullptr) {} const char *getPassName() const override { return "MemorySanitizer"; } bool runOnFunction(Function &F) override; @@ -273,7 +289,6 @@ class MemorySanitizer : public FunctionPass { /// \brief Track origins (allocation points) of uninitialized values. int TrackOrigins; - const DataLayout *DL; LLVMContext *C; Type *IntptrTy; Type *OriginTy; @@ -320,9 +335,11 @@ class MemorySanitizer : public FunctionPass { MDNode *OriginStoreWeights; /// \brief An empty volatile inline asm that prevents callback merge. InlineAsm *EmptyAsm; + Function *MsanCtorFunction; friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; + friend struct VarArgMIPS64Helper; }; } // namespace @@ -434,32 +451,43 @@ void MemorySanitizer::initializeCallbacks(Module &M) { /// /// inserts a call to __msan_init to the module's constructor list. bool MemorySanitizer::doInitialization(Module &M) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) - report_fatal_error("data layout missing"); - DL = &DLP->getDataLayout(); + auto &DL = M.getDataLayout(); Triple TargetTriple(M.getTargetTriple()); - const PlatformMemoryMapParams *PlatformMapParams; - if (TargetTriple.getOS() == Triple::FreeBSD) - PlatformMapParams = &FreeBSDMemoryMapParams; - else - PlatformMapParams = &LinuxMemoryMapParams; - - C = &(M.getContext()); - unsigned PtrSize = DL->getPointerSizeInBits(/* AddressSpace */0); - switch (PtrSize) { - case 64: - MapParams = PlatformMapParams->bits64; + switch (TargetTriple.getOS()) { + case Triple::FreeBSD: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = FreeBSD_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = FreeBSD_X86_MemoryMapParams.bits32; + break; + default: + report_fatal_error("unsupported architecture"); + } break; - case 32: - MapParams = PlatformMapParams->bits32; + case Triple::Linux: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = Linux_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = Linux_X86_MemoryMapParams.bits32; + break; + case Triple::mips64: + case Triple::mips64el: + MapParams = Linux_MIPS_MemoryMapParams.bits64; + break; + default: + report_fatal_error("unsupported architecture"); + } break; default: - report_fatal_error("unsupported pointer size"); - break; + report_fatal_error("unsupported operating system"); } + C = &(M.getContext()); IRBuilder<> IRB(*C); IntptrTy = IRB.getIntPtrTy(DL); OriginTy = IRB.getInt32Ty(); @@ -467,9 +495,12 @@ bool MemorySanitizer::doInitialization(Module &M) { ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000); OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); - // Insert a call to __msan_init/__msan_track_origins into the module's CTORs. 
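// [Editor's sketch, not part of the diff] Shape of the platform dispatch
// rewritten above: MSan now keys its shadow-map parameters on the
// (OS, architecture) pair from the target triple rather than on pointer
// width alone, which is what makes room for the new mips64 Linux entry. The
// strings returned here stand in for the MemoryMapParams tables.
#include "llvm/ADT/Triple.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;

const char *pickShadowLayout(const Triple &TargetTriple) {
  switch (TargetTriple.getOS()) {
  case Triple::FreeBSD:
    switch (TargetTriple.getArch()) {
    case Triple::x86_64: return "FreeBSD x86-64 map";
    case Triple::x86:    return "FreeBSD i386 map";
    default: report_fatal_error("unsupported architecture");
    }
  case Triple::Linux:
    switch (TargetTriple.getArch()) {
    case Triple::x86_64:   return "Linux x86-64 map";
    case Triple::x86:      return "Linux i386 map";
    case Triple::mips64:
    case Triple::mips64el: return "Linux mips64 map";
    default: report_fatal_error("unsupported architecture");
    }
  default:
    report_fatal_error("unsupported operating system");
  }
}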
- appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction( - "__msan_init", IRB.getVoidTy(), nullptr)), 0); + std::tie(MsanCtorFunction, std::ignore) = + createSanitizerCtorAndInitFunctions(M, kMsanModuleCtorName, kMsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}); + + appendToGlobalCtors(M, MsanCtorFunction, 0); if (TrackOrigins) new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, @@ -555,8 +586,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { - bool SanitizeFunction = F.getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::SanitizeMemory); + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); InsertChecks = SanitizeFunction; PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; @@ -575,39 +605,86 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return IRB.CreateCall(MS.MsanChainOriginFn, V); } + Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); + if (IntptrSize == kOriginSize) return Origin; + assert(IntptrSize == kOriginSize * 2); + Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false); + return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8)); + } + + /// \brief Fill memory range with the given origin value. + void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, + unsigned Size, unsigned Alignment) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy); + unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); + assert(IntptrAlignment >= kMinOriginAlignment); + assert(IntptrSize >= kOriginSize); + + unsigned Ofs = 0; + unsigned CurrentAlignment = Alignment; + if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { + Value *IntptrOrigin = originToIntptr(IRB, Origin); + Value *IntptrOriginPtr = + IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0)); + for (unsigned i = 0; i < Size / IntptrSize; ++i) { + Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) + : IntptrOriginPtr; + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + Ofs += IntptrSize / kOriginSize; + CurrentAlignment = IntptrAlignment; + } + } + + for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { + Value *GEP = + i ? 
IRB.CreateConstGEP1_32(nullptr, OriginPtr, i) : OriginPtr; + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + CurrentAlignment = kMinOriginAlignment; + } + } + void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, unsigned Alignment, bool AsCall) { + const DataLayout &DL = F.getParent()->getDataLayout(); unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); if (isa<StructType>(Shadow->getType())) { - IRB.CreateAlignedStore(updateOrigin(Origin, IRB), - getOriginPtr(Addr, IRB, Alignment), - OriginAlignment); + paintOrigin(IRB, updateOrigin(Origin, IRB), + getOriginPtr(Addr, IRB, Alignment), StoreSize, + OriginAlignment); } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - // TODO(eugenis): handle non-zero constant shadow by inserting an - // unconditional check (can not simply fail compilation as this could - // be in the dead code). - if (!ClCheckConstantShadow) - if (isa<Constant>(ConvertedShadow)) return; + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) + paintOrigin(IRB, updateOrigin(Origin, IRB), + getOriginPtr(Addr, IRB, Alignment), StoreSize, + OriginAlignment); + return; + } + unsigned TypeSizeInBits = - MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); + DL.getTypeSizeInBits(ConvertedShadow->getType()); unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); if (AsCall && SizeIndex < kNumberOfAccessSizes) { Value *Fn = MS.MaybeStoreOriginFn[SizeIndex]; Value *ConvertedShadow2 = IRB.CreateZExt( ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); - IRB.CreateCall3(Fn, ConvertedShadow2, - IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), - Origin); + IRB.CreateCall(Fn, {ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + Origin}); } else { Value *Cmp = IRB.CreateICmpNE( ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = SplitBlockAndInsertIfThen( Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); IRBuilder<> IRBNew(CheckTerm); - IRBNew.CreateAlignedStore(updateOrigin(Origin, IRBNew), - getOriginPtr(Addr, IRBNew, Alignment), - OriginAlignment); + paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), + getOriginPtr(Addr, IRBNew, Alignment), StoreSize, + OriginAlignment); } } } @@ -643,19 +720,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - // See the comment in storeOrigin(). - if (!ClCheckConstantShadow) - if (isa<Constant>(ConvertedShadow)) return; - unsigned TypeSizeInBits = - MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); + + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { + if (MS.TrackOrigins) { + IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0), + MS.OriginTLS); + } + IRB.CreateCall(MS.WarningFn, {}); + IRB.CreateCall(MS.EmptyAsm, {}); + // FIXME: Insert UnreachableInst if !ClKeepGoing? + // This may invalidate some of the following checks and needs to be done + // at the very end. 
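
// The paintOrigin/originToIntptr pair above replicates a 32-bit origin id
// across every 4-byte origin slot covered by a store, using pointer-sized
// stores where alignment permits. A rough scalar model for a 64-bit target
// (illustrative only; the pass emits this logic as IR):

#include <cstdint>
#include <cstring>

static void paintOriginModel(uint32_t Origin, unsigned char *OriginPtr,
                             unsigned Size, unsigned Alignment) {
  const unsigned kOriginSize = 4;                    // bytes per origin slot
  const unsigned Slots = (Size + kOriginSize - 1) / kOriginSize;
  unsigned i = 0;
  if (Alignment >= 8) {                              // the fast, wide path
    // originToIntptr: duplicate the id into both halves of a 64-bit word.
    uint64_t Wide = (uint64_t)Origin | ((uint64_t)Origin << 32);
    for (; (i + 2) * kOriginSize <= Size; i += 2)
      std::memcpy(OriginPtr + i * kOriginSize, &Wide, sizeof(Wide));
  }
  for (; i < Slots; ++i)                             // 4-byte tail
    std::memcpy(OriginPtr + i * kOriginSize, &Origin, kOriginSize);
}
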
+ } + return; + } + + const DataLayout &DL = OrigIns->getModule()->getDataLayout(); + + unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType()); unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); if (AsCall && SizeIndex < kNumberOfAccessSizes) { Value *Fn = MS.MaybeWarningFn[SizeIndex]; Value *ConvertedShadow2 = IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); - IRB.CreateCall2(Fn, ConvertedShadow2, MS.TrackOrigins && Origin + IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin ? Origin - : (Value *)IRB.getInt32(0)); + : (Value *)IRB.getInt32(0)}); } else { Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); @@ -668,8 +760,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0), MS.OriginTLS); } - IRB.CreateCall(MS.WarningFn); - IRB.CreateCall(MS.EmptyAsm); + IRB.CreateCall(MS.WarningFn, {}); + IRB.CreateCall(MS.EmptyAsm, {}); DEBUG(dbgs() << " CHECK: " << *Cmp << "\n"); } } @@ -687,7 +779,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Add MemorySanitizer instrumentation to a function. bool runOnFunction() { MS.initializeCallbacks(*F.getParent()); - if (!MS.DL) return false; // In the presence of unreachable blocks, we may see Phi nodes with // incoming nodes from such blocks. Since InstVisitor skips unreachable @@ -743,8 +834,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // This may return weird-sized types like i1. if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy)) return IT; + const DataLayout &DL = F.getParent()->getDataLayout(); if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) { - uint32_t EltSize = MS.DL->getTypeSizeInBits(VT->getElementType()); + uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType()); return VectorType::get(IntegerType::get(*MS.C, EltSize), VT->getNumElements()); } @@ -760,7 +852,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n"); return Res; } - uint32_t TypeSize = MS.DL->getTypeSizeInBits(OrigTy); + uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy); return IntegerType::get(*MS.C, TypeSize); } @@ -953,14 +1045,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Function *F = A->getParent(); IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI()); unsigned ArgOffset = 0; + const DataLayout &DL = F->getParent()->getDataLayout(); for (auto &FArg : F->args()) { if (!FArg.getType()->isSized()) { DEBUG(dbgs() << "Arg is not sized\n"); continue; } - unsigned Size = FArg.hasByValAttr() - ? MS.DL->getTypeAllocSize(FArg.getType()->getPointerElementType()) - : MS.DL->getTypeAllocSize(FArg.getType()); + unsigned Size = + FArg.hasByValAttr() + ? DL.getTypeAllocSize(FArg.getType()->getPointerElementType()) + : DL.getTypeAllocSize(FArg.getType()); if (A == &FArg) { bool Overflow = ArgOffset + Size > kParamTLSSize; Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); @@ -971,7 +1065,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { unsigned ArgAlign = FArg.getParamAlignment(); if (ArgAlign == 0) { Type *EltType = A->getType()->getPointerElementType(); - ArgAlign = MS.DL->getABITypeAlignment(EltType); + ArgAlign = DL.getABITypeAlignment(EltType); } if (Overflow) { // ParamTLS overflow. 
@@ -1708,11 +1802,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Similar situation exists for memcpy and memset. void visitMemMoveInst(MemMoveInst &I) { IRBuilder<> IRB(&I); - IRB.CreateCall3( - MS.MemmoveFn, - IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), - IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()), - IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)); + IRB.CreateCall( + MS.MemmoveFn, + {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)}); I.eraseFromParent(); } @@ -1722,22 +1816,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // alignment. void visitMemCpyInst(MemCpyInst &I) { IRBuilder<> IRB(&I); - IRB.CreateCall3( - MS.MemcpyFn, - IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), - IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()), - IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)); + IRB.CreateCall( + MS.MemcpyFn, + {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)}); I.eraseFromParent(); } // Same as memcpy. void visitMemSetInst(MemSetInst &I) { IRBuilder<> IRB(&I); - IRB.CreateCall3( - MS.MemsetFn, - IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), - IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false), - IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)); + IRB.CreateCall( + MS.MemsetFn, + {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)}); I.eraseFromParent(); } @@ -2018,8 +2112,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { : Lower64ShadowExtend(IRB, S2, getShadowTy(&I)); Value *V1 = I.getOperand(0); Value *V2 = I.getOperand(1); - Value *Shift = IRB.CreateCall2(I.getCalledValue(), - IRB.CreateBitCast(S1, V1->getType()), V2); + Value *Shift = IRB.CreateCall(I.getCalledValue(), + {IRB.CreateBitCast(S1, V1->getType()), V2}); Shift = IRB.CreateBitCast(Shift, getShadowTy(&I)); setShadow(&I, IRB.CreateOr(Shift, S2Conv)); setOriginForNaryOp(I); @@ -2099,7 +2193,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Function *ShadowFn = Intrinsic::getDeclaration( F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); - Value *S = IRB.CreateCall2(ShadowFn, S1_ext, S2_ext, "_msprop_vector_pack"); + Value *S = + IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack"); if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I)); setShadow(&I, S); setOriginForNaryOp(I); @@ -2178,15 +2273,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case llvm::Intrinsic::x86_sse_cvttps2pi: handleVectorConvertIntrinsic(I, 2); break; - case llvm::Intrinsic::x86_avx512_psll_dq: - case llvm::Intrinsic::x86_avx512_psrl_dq: case llvm::Intrinsic::x86_avx2_psll_w: case llvm::Intrinsic::x86_avx2_psll_d: case llvm::Intrinsic::x86_avx2_psll_q: case llvm::Intrinsic::x86_avx2_pslli_w: case llvm::Intrinsic::x86_avx2_pslli_d: case llvm::Intrinsic::x86_avx2_pslli_q: - case llvm::Intrinsic::x86_avx2_psll_dq: case llvm::Intrinsic::x86_avx2_psrl_w: case llvm::Intrinsic::x86_avx2_psrl_d: case llvm::Intrinsic::x86_avx2_psrl_q: 
@@ -2197,14 +2289,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case llvm::Intrinsic::x86_avx2_psrli_q: case llvm::Intrinsic::x86_avx2_psrai_w: case llvm::Intrinsic::x86_avx2_psrai_d: - case llvm::Intrinsic::x86_avx2_psrl_dq: case llvm::Intrinsic::x86_sse2_psll_w: case llvm::Intrinsic::x86_sse2_psll_d: case llvm::Intrinsic::x86_sse2_psll_q: case llvm::Intrinsic::x86_sse2_pslli_w: case llvm::Intrinsic::x86_sse2_pslli_d: case llvm::Intrinsic::x86_sse2_pslli_q: - case llvm::Intrinsic::x86_sse2_psll_dq: case llvm::Intrinsic::x86_sse2_psrl_w: case llvm::Intrinsic::x86_sse2_psrl_d: case llvm::Intrinsic::x86_sse2_psrl_q: @@ -2215,7 +2305,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case llvm::Intrinsic::x86_sse2_psrli_q: case llvm::Intrinsic::x86_sse2_psrai_w: case llvm::Intrinsic::x86_sse2_psrai_d: - case llvm::Intrinsic::x86_sse2_psrl_dq: case llvm::Intrinsic::x86_mmx_psll_w: case llvm::Intrinsic::x86_mmx_psll_d: case llvm::Intrinsic::x86_mmx_psll_q: @@ -2247,14 +2336,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleVectorShiftIntrinsic(I, /* Variable */ true); break; - // Byte shifts are not implemented. - // case llvm::Intrinsic::x86_avx512_psll_dq_bs: - // case llvm::Intrinsic::x86_avx512_psrl_dq_bs: - // case llvm::Intrinsic::x86_avx2_psll_dq_bs: - // case llvm::Intrinsic::x86_avx2_psrl_dq_bs: - // case llvm::Intrinsic::x86_sse2_psll_dq_bs: - // case llvm::Intrinsic::x86_sse2_psrl_dq_bs: - case llvm::Intrinsic::x86_sse2_packsswb_128: case llvm::Intrinsic::x86_sse2_packssdw_128: case llvm::Intrinsic::x86_sse2_packuswb_128: @@ -2356,10 +2437,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << " Arg#" << i << ": " << *A << " Shadow: " << *ArgShadow << "\n"); bool ArgIsInitialized = false; + const DataLayout &DL = F.getParent()->getDataLayout(); if (CS.paramHasAttr(i + 1, Attribute::ByVal)) { assert(A->getType()->isPointerTy() && "ByVal argument is not a pointer!"); - Size = MS.DL->getTypeAllocSize(A->getType()->getPointerElementType()); + Size = DL.getTypeAllocSize(A->getType()->getPointerElementType()); if (ArgOffset + Size > kParamTLSSize) break; unsigned ParamAlignment = CS.getParamAlignment(i + 1); unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment); @@ -2367,7 +2449,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB), Size, Alignment); } else { - Size = MS.DL->getTypeAllocSize(A->getType()); + Size = DL.getTypeAllocSize(A->getType()); if (ArgOffset + Size > kParamTLSSize) break; Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, kShadowTLSAlignment); @@ -2460,11 +2542,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setShadow(&I, getCleanShadow(&I)); setOrigin(&I, getCleanOrigin()); IRBuilder<> IRB(I.getNextNode()); - uint64_t Size = MS.DL->getTypeAllocSize(I.getAllocatedType()); + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(I.getAllocatedType()); if (PoisonStack && ClPoisonStackWithCall) { - IRB.CreateCall2(MS.MsanPoisonStackFn, - IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), - ConstantInt::get(MS.IntptrTy, Size)); + IRB.CreateCall(MS.MsanPoisonStackFn, + {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), + ConstantInt::get(MS.IntptrTy, Size)}); } else { Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB); Value *PoisonValue = 
IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0); @@ -2484,11 +2567,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { createPrivateNonConstGlobalForString(*F.getParent(), StackDescription.str()); - IRB.CreateCall4(MS.MsanSetAllocaOrigin4Fn, - IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), + IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn, + {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), ConstantInt::get(MS.IntptrTy, Size), IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()), - IRB.CreatePointerCast(&F, MS.IntptrTy)); + IRB.CreatePointerCast(&F, MS.IntptrTy)}); } } @@ -2652,6 +2735,7 @@ struct VarArgAMD64Helper : public VarArgHelper { unsigned GpOffset = 0; unsigned FpOffset = AMD64GpEndOffset; unsigned OverflowOffset = AMD64FpEndOffset; + const DataLayout &DL = F.getParent()->getDataLayout(); for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); ArgIt != End; ++ArgIt) { Value *A = *ArgIt; @@ -2661,7 +2745,7 @@ struct VarArgAMD64Helper : public VarArgHelper { // ByVal arguments always go to the overflow area. assert(A->getType()->isPointerTy()); Type *RealTy = A->getType()->getPointerElementType(); - uint64_t ArgSize = MS.DL->getTypeAllocSize(RealTy); + uint64_t ArgSize = DL.getTypeAllocSize(RealTy); Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset); OverflowOffset += RoundUpToAlignment(ArgSize, 8); IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB), @@ -2683,7 +2767,7 @@ struct VarArgAMD64Helper : public VarArgHelper { FpOffset += 16; break; case AK_Memory: - uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType()); + uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); OverflowOffset += RoundUpToAlignment(ArgSize, 8); } @@ -2768,12 +2852,114 @@ struct VarArgAMD64Helper : public VarArgHelper { Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr = MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB); - Value *SrcPtr = IRB.CreateConstGEP1_32(VAArgTLSCopy, AMD64FpEndOffset); + Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy, + AMD64FpEndOffset); IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16); } } }; +/// \brief MIPS64-specific implementation of VarArgHelper. 
+struct VarArgMIPS64Helper : public VarArgHelper {
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy;
+  Value *VAArgSize;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+                     MemorySanitizerVisitor &MSV)
+    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+      VAArgSize(nullptr) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    unsigned VAArgOffset = 0;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      Value *Base;
+      uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+#if defined(__MIPSEB__) || defined(MIPSEB)
+      // Adjust the shadow for arguments with size < 8 to match the placement
+      // of bits on a big-endian system.
+      if (ArgSize < 8)
+        VAArgOffset += (8 - ArgSize);
+#endif
+      Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset);
+      VAArgOffset += ArgSize;
+      VAArgOffset = RoundUpToAlignment(VAArgOffset, 8);
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+
+    Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
+    // Reuse VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creating a new
+    // class member; it holds the total size of all varargs.
+    IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// \brief Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+                                   int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */8, /* alignment */8, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) override {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */8, /* alignment */8, false);
+  }
+
+  void finalizeInstrumentation() override {
+    assert(!VAArgSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+    VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+    Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+                                    VAArgSize);
+
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+    }
+
+    // Instrument va_start.
+    // Copy va_list shadow from the backup copy of the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+      Value *RegSaveAreaPtrPtr =
+        IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                           Type::getInt64PtrTy(*MS.C));
+      Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr =
+        MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8);
+    }
+  }
+};
+
 /// \brief A no-op implementation of VarArgHelper.
 struct VarArgNoOpHelper : public VarArgHelper {
   VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
@@ -2795,6 +2981,9 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
   llvm::Triple TargetTriple(Func.getParent()->getTargetTriple());
   if (TargetTriple.getArch() == llvm::Triple::x86_64)
     return new VarArgAMD64Helper(Func, Msan, Visitor);
+  else if (TargetTriple.getArch() == llvm::Triple::mips64 ||
+           TargetTriple.getArch() == llvm::Triple::mips64el)
+    return new VarArgMIPS64Helper(Func, Msan, Visitor);
   else
     return new VarArgNoOpHelper(Func, Msan, Visitor);
 }
@@ -2802,6 +2991,8 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
 } // namespace
 
 bool MemorySanitizer::runOnFunction(Function &F) {
+  if (&F == MsanCtorFunction)
+    return false;
   MemorySanitizerVisitor Visitor(F, *this);
 
   // Clear out readonly/readnone attributes.
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index c048a99..f6ae0c2 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -11,19 +11,17 @@
 // and potentially with other Sanitizers.
 //
 // We create a Guard variable with the same linkage
-// as the function and inject this code into the entry block (CoverageLevel=1)
-// or all blocks (CoverageLevel>=2):
+// as the function and inject this code into the entry block (SCK_Function)
+// or all blocks (SCK_BB):
 // if (Guard < 0) {
 //   __sanitizer_cov(&Guard);
 // }
 // The accesses to Guard are atomic. The rest of the logic is
 // in __sanitizer_cov (it's fine to call it more than once).
 //
-// With CoverageLevel>=3 we also split critical edges this effectively
+// With SCK_Edge we also split critical edges, effectively
 // instrumenting all edges.
 //
-// CoverageLevel>=4 add indirect call profiling implented as a function call.
-//
 // This coverage implementation provides very limited data:
 // it only tells if a given function (block) was ever executed. No counters.
 // But for many use cases this is what we need, and the added slowdown is small.
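
// After lowering, each instrumented basic block behaves roughly like the
// C-style model below (BlockIndex and the exact guard protocol are
// illustrative; the guards live in the pass-generated __sancov_gen_cov
// array, whose element 0 holds the array size):

#include <cstdint>

extern "C" void __sanitizer_cov(int32_t *Guard);   // runtime callback
extern int32_t __sancov_gen_cov[];                 // emitted by the pass

static inline void sancov_hit(unsigned BlockIndex) {
  int32_t *Guard = &__sancov_gen_cov[BlockIndex + 1];  // +1 skips the size
  // Monotonic (relaxed) atomic load; the guard stays non-positive until
  // this block has been reported once.
  if (__atomic_load_n(Guard, __ATOMIC_RELAXED) <= 0)
    __sanitizer_cov(Guard);  // records the caller PC and updates *Guard so
                             // later hits skip the call
}

// For functions with more blocks than -sanitizer-coverage-block-threshold,
// the pass instead emits an unconditional call to
// __sanitizer_cov_with_check(Guard), moving the same test into the runtime.
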
@@ -55,11 +53,13 @@ using namespace llvm; static const char *const kSanCovModuleInitName = "__sanitizer_cov_module_init"; static const char *const kSanCovName = "__sanitizer_cov"; +static const char *const kSanCovWithCheckName = "__sanitizer_cov_with_check"; static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16"; static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter"; static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block"; +static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp"; static const char *const kSanCovModuleCtorName = "sancov.module_ctor"; -static const uint64_t kSanCtorAndDtorPriority = 1; +static const uint64_t kSanCtorAndDtorPriority = 2; static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level", cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, " @@ -67,11 +67,11 @@ static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level", "4: above plus indirect calls"), cl::Hidden, cl::init(0)); -static cl::opt<int> ClCoverageBlockThreshold( +static cl::opt<unsigned> ClCoverageBlockThreshold( "sanitizer-coverage-block-threshold", - cl::desc("Add coverage instrumentation only to the entry block if there " - "are more than this number of blocks."), - cl::Hidden, cl::init(1500)); + cl::desc("Use a callback with a guard check inside it if there are" + " more than this number of blocks."), + cl::Hidden, cl::init(500)); static cl::opt<bool> ClExperimentalTracing("sanitizer-coverage-experimental-tracing", @@ -79,13 +79,63 @@ static cl::opt<bool> "callbacks at every basic block"), cl::Hidden, cl::init(false)); +static cl::opt<bool> + ClExperimentalCMPTracing("sanitizer-coverage-experimental-trace-compares", + cl::desc("Experimental tracing of CMP and similar " + "instructions"), + cl::Hidden, cl::init(false)); + +// Experimental 8-bit counters used as an additional search heuristic during +// coverage-guided fuzzing. +// The counters are not thread-friendly: +// - contention on these counters may cause significant slowdown; +// - the counter updates are racy and the results may be inaccurate. +// They are also inaccurate due to 8-bit integer overflow. +static cl::opt<bool> ClUse8bitCounters("sanitizer-coverage-8bit-counters", + cl::desc("Experimental 8-bit counters"), + cl::Hidden, cl::init(false)); + namespace { +SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) { + SanitizerCoverageOptions Res; + switch (LegacyCoverageLevel) { + case 0: + Res.CoverageType = SanitizerCoverageOptions::SCK_None; + break; + case 1: + Res.CoverageType = SanitizerCoverageOptions::SCK_Function; + break; + case 2: + Res.CoverageType = SanitizerCoverageOptions::SCK_BB; + break; + case 3: + Res.CoverageType = SanitizerCoverageOptions::SCK_Edge; + break; + case 4: + Res.CoverageType = SanitizerCoverageOptions::SCK_Edge; + Res.IndirectCalls = true; + break; + } + return Res; +} + +SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { + // Sets CoverageType and IndirectCalls. 
+ SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel); + Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType); + Options.IndirectCalls |= CLOpts.IndirectCalls; + Options.TraceBB |= ClExperimentalTracing; + Options.TraceCmp |= ClExperimentalCMPTracing; + Options.Use8bitCounters |= ClUse8bitCounters; + return Options; +} + class SanitizerCoverageModule : public ModulePass { public: - SanitizerCoverageModule(int CoverageLevel = 0) - : ModulePass(ID), - CoverageLevel(std::max(CoverageLevel, (int)ClCoverageLevel)) {} + SanitizerCoverageModule( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : ModulePass(ID), Options(OverrideFromCL(Options)) {} bool runOnModule(Module &M) override; bool runOnFunction(Function &F); static char ID; // Pass identification, replacement for typeid @@ -93,104 +143,135 @@ class SanitizerCoverageModule : public ModulePass { return "SanitizerCoverageModule"; } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DataLayoutPass>(); - } - private: void InjectCoverageForIndirectCalls(Function &F, ArrayRef<Instruction *> IndirCalls); - bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks, - ArrayRef<Instruction *> IndirCalls); - void InjectCoverageAtBlock(Function &F, BasicBlock &BB); + void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); + bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); + void SetNoSanitizeMetadata(Instruction *I); + void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); + unsigned NumberOfInstrumentedBlocks() { + return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + } Function *SanCovFunction; + Function *SanCovWithCheckFunction; Function *SanCovIndirCallFunction; - Function *SanCovModuleInit; Function *SanCovTraceEnter, *SanCovTraceBB; + Function *SanCovTraceCmpFunction; InlineAsm *EmptyAsm; - Type *IntptrTy; + Type *IntptrTy, *Int64Ty; LLVMContext *C; + const DataLayout *DL; GlobalVariable *GuardArray; + GlobalVariable *EightBitCounterArray; - int CoverageLevel; + SanitizerCoverageOptions Options; }; } // namespace -static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { - if (Function *F = dyn_cast<Function>(FuncOrBitcast)) - return F; - std::string Err; - raw_string_ostream Stream(Err); - Stream << "SanitizerCoverage interface function redefined: " - << *FuncOrBitcast; - report_fatal_error(Err); -} - bool SanitizerCoverageModule::runOnModule(Module &M) { - if (!CoverageLevel) return false; + if (Options.CoverageType == SanitizerCoverageOptions::SCK_None) + return false; C = &(M.getContext()); - DataLayoutPass *DLP = &getAnalysis<DataLayoutPass>(); - IntptrTy = Type::getIntNTy(*C, DLP->getDataLayout().getPointerSizeInBits()); + DL = &M.getDataLayout(); + IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits()); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); + Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + Int64Ty = IRB.getInt64Ty(); - Function *CtorFunc = - Function::Create(FunctionType::get(VoidTy, false), - GlobalValue::InternalLinkage, kSanCovModuleCtorName, &M); - ReturnInst::Create(*C, BasicBlock::Create(*C, "", CtorFunc)); - appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority); - - SanCovFunction = checkInterfaceFunction( + SanCovFunction = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kSanCovName, VoidTy, Int32PtrTy, nullptr)); - SanCovIndirCallFunction = 
checkInterfaceFunction(M.getOrInsertFunction( - kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr)); - SanCovModuleInit = checkInterfaceFunction( - M.getOrInsertFunction(kSanCovModuleInitName, Type::getVoidTy(*C), - Int32PtrTy, IntptrTy, nullptr)); - SanCovModuleInit->setLinkage(Function::ExternalLinkage); + SanCovWithCheckFunction = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovWithCheckName, VoidTy, Int32PtrTy, nullptr)); + SanCovIndirCallFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr)); + SanCovTraceCmpFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr)); + // We insert an empty inline asm after cov callbacks to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); - if (ClExperimentalTracing) { - SanCovTraceEnter = checkInterfaceFunction( + if (Options.TraceBB) { + SanCovTraceEnter = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); - SanCovTraceBB = checkInterfaceFunction( + SanCovTraceBB = checkSanitizerInterfaceFunction( M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); } // At this point we create a dummy array of guards because we don't // know how many elements we will need. Type *Int32Ty = IRB.getInt32Ty(); + Type *Int8Ty = IRB.getInt8Ty(); + GuardArray = new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, "__sancov_gen_cov_tmp"); + if (Options.Use8bitCounters) + EightBitCounterArray = + new GlobalVariable(M, Int8Ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__sancov_gen_cov_tmp"); for (auto &F : M) runOnFunction(F); + auto N = NumberOfInstrumentedBlocks(); + // Now we know how many elements we need. Create an array of guards // with one extra element at the beginning for the size. - Type *Int32ArrayNTy = - ArrayType::get(Int32Ty, SanCovFunction->getNumUses() + 1); + Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1); GlobalVariable *RealGuardArray = new GlobalVariable( M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage, Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov"); + // Replace the dummy array with the real one. GuardArray->replaceAllUsesWith( IRB.CreatePointerCast(RealGuardArray, Int32PtrTy)); GuardArray->eraseFromParent(); - // Call __sanitizer_cov_module_init - IRB.SetInsertPoint(CtorFunc->getEntryBlock().getTerminator()); - IRB.CreateCall2(SanCovModuleInit, - IRB.CreatePointerCast(RealGuardArray, Int32PtrTy), - ConstantInt::get(IntptrTy, SanCovFunction->getNumUses())); + GlobalVariable *RealEightBitCounterArray; + if (Options.Use8bitCounters) { + // Make sure the array is 16-aligned. 
+ static const int kCounterAlignment = 16; + Type *Int8ArrayNTy = + ArrayType::get(Int8Ty, RoundUpToAlignment(N, kCounterAlignment)); + RealEightBitCounterArray = new GlobalVariable( + M, Int8ArrayNTy, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Int8ArrayNTy), "__sancov_gen_cov_counter"); + RealEightBitCounterArray->setAlignment(kCounterAlignment); + EightBitCounterArray->replaceAllUsesWith( + IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy)); + EightBitCounterArray->eraseFromParent(); + } + + // Create variable for module (compilation unit) name + Constant *ModNameStrConst = + ConstantDataArray::getString(M.getContext(), M.getName(), true); + GlobalVariable *ModuleName = + new GlobalVariable(M, ModNameStrConst->getType(), true, + GlobalValue::PrivateLinkage, ModNameStrConst); + + Function *CtorFunc; + std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions( + M, kSanCovModuleCtorName, kSanCovModuleInitName, + {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy}, + {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy), + ConstantInt::get(IntptrTy, N), + Options.Use8bitCounters + ? IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy) + : Constant::getNullValue(Int8PtrTy), + IRB.CreatePointerCast(ModuleName, Int8PtrTy)}); + + appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority); + return true; } @@ -198,38 +279,44 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (F.empty()) return false; if (F.getName().find(".module_ctor") != std::string::npos) return false; // Should not instrument sanitizer init functions. - if (CoverageLevel >= 3) - SplitAllCriticalEdges(F, this); + if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) + SplitAllCriticalEdges(F); SmallVector<Instruction*, 8> IndirCalls; SmallVector<BasicBlock*, 16> AllBlocks; + SmallVector<Instruction*, 8> CmpTraceTargets; for (auto &BB : F) { AllBlocks.push_back(&BB); - if (CoverageLevel >= 4) - for (auto &Inst : BB) { + for (auto &Inst : BB) { + if (Options.IndirectCalls) { CallSite CS(&Inst); if (CS && !CS.getCalledFunction()) IndirCalls.push_back(&Inst); } + if (Options.TraceCmp && isa<ICmpInst>(&Inst)) + CmpTraceTargets.push_back(&Inst); + } } - InjectCoverage(F, AllBlocks, IndirCalls); + InjectCoverage(F, AllBlocks); + InjectCoverageForIndirectCalls(F, IndirCalls); + InjectTraceForCmp(F, CmpTraceTargets); return true; } -bool -SanitizerCoverageModule::InjectCoverage(Function &F, - ArrayRef<BasicBlock *> AllBlocks, - ArrayRef<Instruction *> IndirCalls) { - if (!CoverageLevel) return false; - - if (CoverageLevel == 1 || - (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) { - InjectCoverageAtBlock(F, F.getEntryBlock()); - } else { +bool SanitizerCoverageModule::InjectCoverage(Function &F, + ArrayRef<BasicBlock *> AllBlocks) { + switch (Options.CoverageType) { + case SanitizerCoverageOptions::SCK_None: + return false; + case SanitizerCoverageOptions::SCK_Function: + InjectCoverageAtBlock(F, F.getEntryBlock(), false); + return true; + default: { + bool UseCalls = ClCoverageBlockThreshold < AllBlocks.size(); for (auto BB : AllBlocks) - InjectCoverageAtBlock(F, *BB); + InjectCoverageAtBlock(F, *BB, UseCalls); + return true; + } } - InjectCoverageForIndirectCalls(F, IndirCalls); - return true; } // On every indirect call we call a run-time function @@ -249,19 +336,44 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( IRBuilder<> IRB(I); CallSite CS(I); Value *Callee = CS.getCalledValue(); - if (dyn_cast<InlineAsm>(Callee)) continue; + if 
(isa<InlineAsm>(Callee)) continue; GlobalVariable *CalleeCache = new GlobalVariable( *F.getParent(), Ty, false, GlobalValue::PrivateLinkage, Constant::getNullValue(Ty), "__sancov_gen_callee_cache"); CalleeCache->setAlignment(kCacheAlignment); - IRB.CreateCall2(SanCovIndirCallFunction, - IRB.CreatePointerCast(Callee, IntptrTy), - IRB.CreatePointerCast(CalleeCache, IntptrTy)); + IRB.CreateCall(SanCovIndirCallFunction, + {IRB.CreatePointerCast(Callee, IntptrTy), + IRB.CreatePointerCast(CalleeCache, IntptrTy)}); + } +} + +void SanitizerCoverageModule::InjectTraceForCmp( + Function &F, ArrayRef<Instruction *> CmpTraceTargets) { + for (auto I : CmpTraceTargets) { + if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) { + IRBuilder<> IRB(ICMP); + Value *A0 = ICMP->getOperand(0); + Value *A1 = ICMP->getOperand(1); + if (!A0->getType()->isIntegerTy()) continue; + uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType()); + // __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1); + IRB.CreateCall( + SanCovTraceCmpFunction, + {ConstantInt::get(Int64Ty, (TypeSize << 32) | ICMP->getPredicate()), + IRB.CreateIntCast(A0, Int64Ty, true), + IRB.CreateIntCast(A1, Int64Ty, true)}); + } } } -void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, - BasicBlock &BB) { +void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { + I->setMetadata( + I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"), + MDNode::get(*C, None)); +} + +void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, + bool UseCalls) { BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); // Skip static allocas at the top of the entry block so they don't become // dynamic when we split the block. If we used our optimized stack layout, @@ -273,31 +385,48 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, } bool IsEntryBB = &BB == &F.getEntryBlock(); - DebugLoc EntryLoc = - IsEntryBB ? IP->getDebugLoc().getFnDebugLoc(*C) : IP->getDebugLoc(); + DebugLoc EntryLoc = IsEntryBB && IP->getDebugLoc() + ? IP->getDebugLoc().getFnDebugLoc() + : IP->getDebugLoc(); IRBuilder<> IRB(IP); IRB.SetCurrentDebugLocation(EntryLoc); SmallVector<Value *, 1> Indices; Value *GuardP = IRB.CreateAdd( IRB.CreatePointerCast(GuardArray, IntptrTy), - ConstantInt::get(IntptrTy, (1 + SanCovFunction->getNumUses()) * 4)); + ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - LoadInst *Load = IRB.CreateLoad(GuardP); - Load->setAtomic(Monotonic); - Load->setAlignment(4); - Load->setMetadata(F.getParent()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); - Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); - Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); - IRB.SetInsertPoint(Ins); - IRB.SetCurrentDebugLocation(EntryLoc); - // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. - IRB.CreateCall(SanCovFunction, GuardP); - IRB.CreateCall(EmptyAsm); // Avoids callback merge. 
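
// InjectTraceForCmp above packs the compare's operand width and its ICmp
// predicate into the first argument of __sanitizer_cov_trace_cmp. The
// encoding, restated as a standalone helper (illustrative only):

#include <cstdint>

static uint64_t packTraceCmpArg(uint64_t TypeSizeInBits, unsigned Predicate) {
  // High 32 bits: operand size in bits; low 32 bits: the CmpInst predicate.
  return (TypeSizeInBits << 32) | Predicate;
}
// e.g. a 64-bit equality compare is reported as (64ULL << 32) | ICMP_EQ.
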
+ if (UseCalls) { + IRB.CreateCall(SanCovWithCheckFunction, GuardP); + } else { + LoadInst *Load = IRB.CreateLoad(GuardP); + Load->setAtomic(Monotonic); + Load->setAlignment(4); + SetNoSanitizeMetadata(Load); + Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); + Instruction *Ins = SplitBlockAndInsertIfThen( + Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); + IRB.SetInsertPoint(Ins); + IRB.SetCurrentDebugLocation(EntryLoc); + // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. + IRB.CreateCall(SanCovFunction, GuardP); + IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. + } + + if (Options.Use8bitCounters) { + IRB.SetInsertPoint(IP); + Value *P = IRB.CreateAdd( + IRB.CreatePointerCast(EightBitCounterArray, IntptrTy), + ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1)); + P = IRB.CreateIntToPtr(P, IRB.getInt8PtrTy()); + LoadInst *LI = IRB.CreateLoad(P); + Value *Inc = IRB.CreateAdd(LI, ConstantInt::get(IRB.getInt8Ty(), 1)); + StoreInst *SI = IRB.CreateStore(Inc, P); + SetNoSanitizeMetadata(LI); + SetNoSanitizeMetadata(SI); + } - if (ClExperimentalTracing) { + if (Options.TraceBB) { // Experimental support for tracing. // Insert a callback with the same guard variable as used for coverage. IRB.SetInsertPoint(IP); @@ -309,6 +438,7 @@ char SanitizerCoverageModule::ID = 0; INITIALIZE_PASS(SanitizerCoverageModule, "sancov", "SanitizerCoverage: TODO." "ModulePass", false, false) -ModulePass *llvm::createSanitizerCoverageModulePass(int CoverageLevel) { - return new SanitizerCoverageModule(CoverageLevel); +ModulePass *llvm::createSanitizerCoverageModulePass( + const SanitizerCoverageOptions &Options) { + return new SanitizerCoverageModule(Options); } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 1b86ae5..1a46bbb 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -25,6 +25,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -68,12 +70,16 @@ STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads"); STATISTIC(NumOmittedReadsFromConstantGlobals, "Number of reads from constant globals"); STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); + +static const char *const kTsanModuleCtorName = "tsan.module_ctor"; +static const char *const kTsanInitName = "__tsan_init"; namespace { /// ThreadSanitizer: instrument the code in module to find races. 
struct ThreadSanitizer : public FunctionPass { - ThreadSanitizer() : FunctionPass(ID), DL(nullptr) {} + ThreadSanitizer() : FunctionPass(ID) {} const char *getPassName() const override; bool runOnFunction(Function &F) override; bool doInitialization(Module &M) override; @@ -81,15 +87,15 @@ struct ThreadSanitizer : public FunctionPass { private: void initializeCallbacks(Module &M); - bool instrumentLoadOrStore(Instruction *I); - bool instrumentAtomic(Instruction *I); + bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); + bool instrumentAtomic(Instruction *I, const DataLayout &DL); bool instrumentMemIntrinsic(Instruction *I); - void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, - SmallVectorImpl<Instruction*> &All); + void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local, + SmallVectorImpl<Instruction *> &All, + const DataLayout &DL); bool addrPointsToConstantData(Value *Addr); - int getMemoryAccessFuncIndex(Value *Addr); + int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL); - const DataLayout *DL; Type *IntptrTy; IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. @@ -99,6 +105,8 @@ struct ThreadSanitizer : public FunctionPass { static const size_t kNumberOfAccessSizes = 5; Function *TsanRead[kNumberOfAccessSizes]; Function *TsanWrite[kNumberOfAccessSizes]; + Function *TsanUnalignedRead[kNumberOfAccessSizes]; + Function *TsanUnalignedWrite[kNumberOfAccessSizes]; Function *TsanAtomicLoad[kNumberOfAccessSizes]; Function *TsanAtomicStore[kNumberOfAccessSizes]; Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes]; @@ -108,6 +116,7 @@ struct ThreadSanitizer : public FunctionPass { Function *TsanVptrUpdate; Function *TsanVptrLoad; Function *MemmoveFn, *MemcpyFn, *MemsetFn; + Function *TsanCtorFunction; }; } // namespace @@ -124,44 +133,48 @@ FunctionPass *llvm::createThreadSanitizerPass() { return new ThreadSanitizer(); } -static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { - if (Function *F = dyn_cast<Function>(FuncOrBitcast)) - return F; - FuncOrBitcast->dump(); - report_fatal_error("ThreadSanitizer interface function redefined"); -} - void ThreadSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); // Initialize the callbacks. 
- TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction( + TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction( "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction( - "__tsan_func_exit", IRB.getVoidTy(), nullptr)); + TsanFuncExit = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr)); OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { const size_t ByteSize = 1 << i; const size_t BitSize = ByteSize * 8; SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); - TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction( + TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); - TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction( + TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + + itostr(ByteSize)); + TsanUnalignedRead[i] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + + itostr(ByteSize)); + TsanUnalignedWrite[i] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + "_load"); - TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction( - AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); + TsanAtomicLoad[i] = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + "_store"); - TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction( - AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, - nullptr)); + TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr)); for (int op = AtomicRMWInst::FIRST_BINOP; op <= AtomicRMWInst::LAST_BINOP; ++op) { @@ -184,48 +197,44 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { else continue; SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart); - TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction( - RMWName, Ty, PtrTy, Ty, OrdTy, nullptr)); + TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr)); } SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + "_compare_exchange_val"); - TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction( + TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr)); } - TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction( - "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(), - IRB.getInt8PtrTy(), nullptr)); - TsanVptrLoad = checkInterfaceFunction(M.getOrInsertFunction( + TsanVptrUpdate = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), nullptr)); + TsanVptrLoad = 
checkSanitizerInterfaceFunction(M.getOrInsertFunction( "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction( + TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction( "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, nullptr)); - TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction( + TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction( "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, nullptr)); - MemmoveFn = checkInterfaceFunction(M.getOrInsertFunction( - "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), - IRB.getInt8PtrTy(), IntptrTy, nullptr)); - MemcpyFn = checkInterfaceFunction(M.getOrInsertFunction( - "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), - IntptrTy, nullptr)); - MemsetFn = checkInterfaceFunction(M.getOrInsertFunction( - "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(), - IntptrTy, nullptr)); + MemmoveFn = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, nullptr)); + MemcpyFn = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, nullptr)); + MemsetFn = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt32Ty(), IntptrTy, nullptr)); } bool ThreadSanitizer::doInitialization(Module &M) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) - report_fatal_error("data layout missing"); - DL = &DLP->getDataLayout(); + const DataLayout &DL = M.getDataLayout(); + IntptrTy = DL.getIntPtrType(M.getContext()); + std::tie(TsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions( + M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}); - // Always insert a call to __tsan_init into the module's CTORs. - IRBuilder<> IRB(M.getContext()); - IntptrTy = IRB.getIntPtrTy(DL); - Value *TsanInit = M.getOrInsertFunction("__tsan_init", - IRB.getVoidTy(), nullptr); - appendToGlobalCtors(M, cast<Function>(TsanInit), 0); + appendToGlobalCtors(M, TsanCtorFunction, 0); return true; } @@ -260,6 +269,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { // Instrumenting some of the accesses may be proven redundant. // Currently handled: // - read-before-write (within same BB, no calls between) +// - not captured variables // // We do not handle some of the patterns that should not survive // after the classic compiler optimizations. @@ -269,8 +279,8 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { // 'Local' is a vector of insns within the same BB (no calls between). // 'All' is a vector of insns that will be instrumented. void ThreadSanitizer::chooseInstructionsToInstrument( - SmallVectorImpl<Instruction*> &Local, - SmallVectorImpl<Instruction*> &All) { + SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All, + const DataLayout &DL) { SmallSet<Value*, 8> WriteTargets; // Iterate from the end. for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(), @@ -291,6 +301,17 @@ void ThreadSanitizer::chooseInstructionsToInstrument( continue; } } + Value *Addr = isa<StoreInst>(*I) + ? 
cast<StoreInst>(I)->getPointerOperand() + : cast<LoadInst>(I)->getPointerOperand(); + if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) && + !PointerMayBeCaptured(Addr, true, true)) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + continue; + } All.push_back(I); } Local.clear(); @@ -311,7 +332,10 @@ static bool isAtomic(Instruction *I) { } bool ThreadSanitizer::runOnFunction(Function &F) { - if (!DL) return false; + // This is required to prevent instrumenting call to __tsan_init from within + // the module constructor. + if (&F == TsanCtorFunction) + return false; initializeCallbacks(*F.getParent()); SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> AllLoadsAndStores; @@ -321,6 +345,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) { bool Res = false; bool HasCalls = false; bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread); + const DataLayout &DL = F.getParent()->getDataLayout(); // Traverse all instructions, collect loads/stores/returns, check for calls. for (auto &BB : F) { @@ -335,10 +360,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) { if (isa<MemIntrinsic>(Inst)) MemIntrinCalls.push_back(&Inst); HasCalls = true; - chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, + DL); } } - chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL); } // We have collected all loads and stores. @@ -348,14 +374,14 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // Instrument memory accesses only if we want to report bugs in the function. if (ClInstrumentMemoryAccesses && SanitizeFunction) for (auto Inst : AllLoadsAndStores) { - Res |= instrumentLoadOrStore(Inst); + Res |= instrumentLoadOrStore(Inst, DL); } // Instrument atomic memory accesses in any case (they can be used to // implement synchronization). if (ClInstrumentAtomics) for (auto Inst : AtomicAccesses) { - Res |= instrumentAtomic(Inst); + Res |= instrumentAtomic(Inst, DL); } if (ClInstrumentMemIntrinsics && SanitizeFunction) @@ -372,20 +398,21 @@ bool ThreadSanitizer::runOnFunction(Function &F) { IRB.CreateCall(TsanFuncEntry, ReturnAddress); for (auto RetInst : RetVec) { IRBuilder<> IRBRet(RetInst); - IRBRet.CreateCall(TsanFuncExit); + IRBRet.CreateCall(TsanFuncExit, {}); } Res = true; } return Res; } -bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { +bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I, + const DataLayout &DL) { IRBuilder<> IRB(I); bool IsWrite = isa<StoreInst>(*I); Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand() : cast<LoadInst>(I)->getPointerOperand(); - int Idx = getMemoryAccessFuncIndex(Addr); + int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; if (IsWrite && isVtableAccess(I)) { @@ -400,9 +427,9 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { if (StoredValue->getType()->isIntegerTy()) StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy()); // Call TsanVptrUpdate. 
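
// The capture-tracking filter added to chooseInstructionsToInstrument above
// is the key new heuristic: an address rooted in an alloca that never
// escapes the function cannot be touched by another thread, so there is no
// race to detect. A condensed restatement using the same analysis helpers:

#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

static bool addrIsProvablyThreadLocal(llvm::Value *Addr,
                                      const llvm::DataLayout &DL) {
  return llvm::isa<llvm::AllocaInst>(llvm::GetUnderlyingObject(Addr, DL)) &&
         !llvm::PointerMayBeCaptured(Addr, /*ReturnCaptures=*/true,
                                     /*StoreCaptures=*/true);
}
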
- IRB.CreateCall2(TsanVptrUpdate, - IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), - IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); + IRB.CreateCall(TsanVptrUpdate, + {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())}); NumInstrumentedVtableWrites++; return true; } @@ -412,7 +439,16 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { NumInstrumentedVtableReads++; return true; } - Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; + const unsigned Alignment = IsWrite + ? cast<StoreInst>(I)->getAlignment() + : cast<LoadInst>(I)->getAlignment(); + Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType(); + const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); + Value *OnAccessFunc = nullptr; + if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) + OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; + else + OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); if (IsWrite) NumInstrumentedWrites++; else NumInstrumentedReads++; @@ -445,16 +481,18 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) { IRBuilder<> IRB(I); if (MemSetInst *M = dyn_cast<MemSetInst>(I)) { - IRB.CreateCall3(MemsetFn, - IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), - IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false), - IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)); + IRB.CreateCall( + MemsetFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); I->eraseFromParent(); } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) { - IRB.CreateCall3(isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn, - IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), - IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()), - IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)); + IRB.CreateCall( + isa<MemCpyInst>(M) ? 
MemcpyFn : MemmoveFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); I->eraseFromParent(); } return false; @@ -468,11 +506,11 @@ bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) { // The following page contains more background information: // http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/ -bool ThreadSanitizer::instrumentAtomic(Instruction *I) { +bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { IRBuilder<> IRB(I); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { Value *Addr = LI->getPointerOperand(); - int Idx = getMemoryAccessFuncIndex(Addr); + int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; const size_t ByteSize = 1 << Idx; @@ -486,7 +524,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { Value *Addr = SI->getPointerOperand(); - int Idx = getMemoryAccessFuncIndex(Addr); + int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; const size_t ByteSize = 1 << Idx; @@ -500,7 +538,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { ReplaceInstWithInst(I, C); } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) { Value *Addr = RMWI->getPointerOperand(); - int Idx = getMemoryAccessFuncIndex(Addr); + int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; @@ -517,7 +555,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { ReplaceInstWithInst(I, C); } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) { Value *Addr = CASI->getPointerOperand(); - int Idx = getMemoryAccessFuncIndex(Addr); + int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; const size_t ByteSize = 1 << Idx; @@ -547,11 +585,12 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { return true; } -int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { +int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr, + const DataLayout &DL) { Type *OrigPtrTy = Addr->getType(); Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); assert(OrigTy->isSized()); - uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy); + uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { NumAccessesWithBadSize++; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp new file mode 100644 index 0000000..afb873a --- /dev/null +++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp @@ -0,0 +1,673 @@ +//===- ARCInstKind.cpp - ObjC ARC Optimization ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines several utility functions used by various ARC +/// optimizations which are IMHO too big to be in a header file. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. 
Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#include "ObjCARC.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace llvm::objcarc; + +raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, + const ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + return OS << "ARCInstKind::Retain"; + case ARCInstKind::RetainRV: + return OS << "ARCInstKind::RetainRV"; + case ARCInstKind::RetainBlock: + return OS << "ARCInstKind::RetainBlock"; + case ARCInstKind::Release: + return OS << "ARCInstKind::Release"; + case ARCInstKind::Autorelease: + return OS << "ARCInstKind::Autorelease"; + case ARCInstKind::AutoreleaseRV: + return OS << "ARCInstKind::AutoreleaseRV"; + case ARCInstKind::AutoreleasepoolPush: + return OS << "ARCInstKind::AutoreleasepoolPush"; + case ARCInstKind::AutoreleasepoolPop: + return OS << "ARCInstKind::AutoreleasepoolPop"; + case ARCInstKind::NoopCast: + return OS << "ARCInstKind::NoopCast"; + case ARCInstKind::FusedRetainAutorelease: + return OS << "ARCInstKind::FusedRetainAutorelease"; + case ARCInstKind::FusedRetainAutoreleaseRV: + return OS << "ARCInstKind::FusedRetainAutoreleaseRV"; + case ARCInstKind::LoadWeakRetained: + return OS << "ARCInstKind::LoadWeakRetained"; + case ARCInstKind::StoreWeak: + return OS << "ARCInstKind::StoreWeak"; + case ARCInstKind::InitWeak: + return OS << "ARCInstKind::InitWeak"; + case ARCInstKind::LoadWeak: + return OS << "ARCInstKind::LoadWeak"; + case ARCInstKind::MoveWeak: + return OS << "ARCInstKind::MoveWeak"; + case ARCInstKind::CopyWeak: + return OS << "ARCInstKind::CopyWeak"; + case ARCInstKind::DestroyWeak: + return OS << "ARCInstKind::DestroyWeak"; + case ARCInstKind::StoreStrong: + return OS << "ARCInstKind::StoreStrong"; + case ARCInstKind::CallOrUser: + return OS << "ARCInstKind::CallOrUser"; + case ARCInstKind::Call: + return OS << "ARCInstKind::Call"; + case ARCInstKind::User: + return OS << "ARCInstKind::User"; + case ARCInstKind::IntrinsicUser: + return OS << "ARCInstKind::IntrinsicUser"; + case ARCInstKind::None: + return OS << "ARCInstKind::None"; + } + llvm_unreachable("Unknown instruction class!"); +} + +ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { + Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + + // No (mandatory) arguments. + if (AI == AE) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush) + .Case("clang.arc.use", ARCInstKind::IntrinsicUser) + .Default(ARCInstKind::CallOrUser); + + // One argument. + const Argument *A0 = AI++; + if (AI == AE) + // Argument is a pointer. + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); + // Argument is i8*. 
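// The nested checks above and below dispatch on arity and parameter types
// alone, since the ObjC runtime entry points are distinguishable by signature.
// C-level shapes being matched (sketch):
//   id objc_retain(id obj);          // one object pointer: i8* in IR
//   id objc_loadWeak(id *location);  // pointer to object pointer: i8** in IR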
+ if (ETy->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_retain", ARCInstKind::Retain) + .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV) + .Case("objc_retainBlock", ARCInstKind::RetainBlock) + .Case("objc_release", ARCInstKind::Release) + .Case("objc_autorelease", ARCInstKind::Autorelease) + .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV) + .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop) + .Case("objc_retainedObject", ARCInstKind::NoopCast) + .Case("objc_unretainedObject", ARCInstKind::NoopCast) + .Case("objc_unretainedPointer", ARCInstKind::NoopCast) + .Case("objc_retain_autorelease", + ARCInstKind::FusedRetainAutorelease) + .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease) + .Case("objc_retainAutoreleaseReturnValue", + ARCInstKind::FusedRetainAutoreleaseRV) + .Case("objc_sync_enter", ARCInstKind::User) + .Case("objc_sync_exit", ARCInstKind::User) + .Default(ARCInstKind::CallOrUser); + + // Argument is i8** + if (PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (Pte->getElementType()->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained) + .Case("objc_loadWeak", ARCInstKind::LoadWeak) + .Case("objc_destroyWeak", ARCInstKind::DestroyWeak) + .Default(ARCInstKind::CallOrUser); + } + + // Two arguments, first is i8**. + const Argument *A1 = AI++; + if (AI == AE) + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (Pte->getElementType()->isIntegerTy(8)) + if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + Type *ETy1 = PTy1->getElementType(); + // Second argument is i8* + if (ETy1->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_storeWeak", ARCInstKind::StoreWeak) + .Case("objc_initWeak", ARCInstKind::InitWeak) + .Case("objc_storeStrong", ARCInstKind::StoreStrong) + .Default(ARCInstKind::CallOrUser); + // Second argument is i8**. + if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (Pte1->getElementType()->isIntegerTy(8)) + return StringSwitch<ARCInstKind>(F->getName()) + .Case("objc_moveWeak", ARCInstKind::MoveWeak) + .Case("objc_copyWeak", ARCInstKind::CopyWeak) + // Ignore annotation calls. This is important to stop the + // optimizer from treating annotations as uses which would + // make the state of the pointers they are attempting to + // elucidate to be incorrect. + .Case("llvm.arc.annotation.topdown.bbstart", + ARCInstKind::None) + .Case("llvm.arc.annotation.topdown.bbend", + ARCInstKind::None) + .Case("llvm.arc.annotation.bottomup.bbstart", + ARCInstKind::None) + .Case("llvm.arc.annotation.bottomup.bbend", + ARCInstKind::None) + .Default(ARCInstKind::CallOrUser); + } + + // Anything else. + return ARCInstKind::CallOrUser; +} + +// A whitelist of intrinsics that we know do not use objc pointers or decrement +// ref counts. +static bool isInertIntrinsic(unsigned ID) { + // TODO: Make this into a covered switch. 
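// What the TODO above asks for, in miniature (illustrative enum, not
// Intrinsic::ID): a "covered switch" lists every enumerator and has no
// default, so Clang's -Wswitch warns as soon as a newly added enumerator is
// left unhandled, instead of it silently falling into a default.
enum class ToyKind { Inert, UseOnly, Other };
static bool isInertToy(ToyKind K) {
  switch (K) {
  case ToyKind::Inert:
    return true;
  case ToyKind::UseOnly:
  case ToyKind::Other:
    return false;
  }
  llvm_unreachable("covered switch isn't covered?");
}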
+ switch (ID) { + case Intrinsic::returnaddress: + case Intrinsic::frameaddress: + case Intrinsic::stacksave: + case Intrinsic::stackrestore: + case Intrinsic::vastart: + case Intrinsic::vacopy: + case Intrinsic::vaend: + case Intrinsic::objectsize: + case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: + case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: + case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: + case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: + case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + // Don't let dbg info affect our results. + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + // Short cut: Some intrinsics obviously don't use ObjC pointers. + return true; + default: + return false; + } +} + +// A whitelist of intrinsics that may use objc pointers but that we know do not +// decrement ref counts. +static bool isUseOnlyIntrinsic(unsigned ID) { + // We are conservative and even though intrinsics are unlikely to touch + // reference counts, we whitelist them for safety. + // + // TODO: Expand this into a covered switch. There is a lot more here. + switch (ID) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + return true; + default: + return false; + } +} + +/// \brief Determine what kind of construct V is. +ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Any instruction other than bitcast and gep with a pointer operand has a + // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer + // to a subsequent use, rather than using it themselves, in this sense. + // As a short cut, several other opcodes are known to have no pointer + // operands of interest. And ret is never followed by a release, so it's + // not interesting to examine. + switch (I->getOpcode()) { + case Instruction::Call: { + const CallInst *CI = cast<CallInst>(I); + // See if we have a function that we know something about. + if (const Function *F = CI->getCalledFunction()) { + ARCInstKind Class = GetFunctionClass(F); + if (Class != ARCInstKind::CallOrUser) + return Class; + Intrinsic::ID ID = F->getIntrinsicID(); + if (isInertIntrinsic(ID)) + return ARCInstKind::None; + if (isUseOnlyIntrinsic(ID)) + return ARCInstKind::User; + } + + // Otherwise, be conservative. + return GetCallSiteClass(CI); + } + case Instruction::Invoke: + // Otherwise, be conservative.
+ return GetCallSiteClass(cast<InvokeInst>(I)); + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: + case Instruction::PHI: + case Instruction::Ret: + case Instruction::Br: + case Instruction::Switch: + case Instruction::IndirectBr: + case Instruction::Alloca: + case Instruction::VAArg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::FDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::SExt: + case Instruction::ZExt: + case Instruction::Trunc: + case Instruction::IntToPtr: + case Instruction::FCmp: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::InsertElement: + case Instruction::ExtractElement: + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + break; + case Instruction::ICmp: + // Comparing a pointer with null, or any other constant, isn't an + // interesting use, because we don't care what the pointer points to, or + // about the values of any other dynamic reference-counted pointers. + if (IsPotentialRetainableObjPtr(I->getOperand(1))) + return ARCInstKind::User; + break; + default: + // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. + for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + if (IsPotentialRetainableObjPtr(*OI)) + return ARCInstKind::User; + } + } + + // Otherwise, it's totally inert for ARC purposes. + return ARCInstKind::None; +} + +/// \brief Test if the given class is a kind of user. +bool llvm::objcarc::IsUser(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::User: + case ARCInstKind::CallOrUser: + case ARCInstKind::IntrinsicUser: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::Call: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class is objc_retain or equivalent. +bool llvm::objcarc::IsRetain(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + return true; + // I believe we treat retain block as not a retain since it can copy its + // block. 
+ case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class is objc_autorelease or equivalent. +bool llvm::objcarc::IsAutorelease(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which return their +/// argument verbatim. +bool llvm::objcarc::IsForwarding(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::NoopCast: + return true; + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which do nothing if +/// passed a null pointer. 
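/// For example (C-level sketch; the runtime entry point has the signature
/// id objc_retain(id)):
/// \code
///   objc_retain(nullptr); // does nothing and returns null
/// \endcode
/// so a call of one of these classes whose argument is known null can be
/// deleted outright.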
+bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::RetainBlock: + return true; + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the "tail" keyword. +bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) { + // ARCInstKind::RetainBlock may be given a stack argument. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + return true; + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::RetainBlock: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which are never safe +/// to mark with the "tail" keyword. +bool llvm::objcarc::IsNeverTail(ARCInstKind Class) { + /// It is never safe to tail call objc_autorelease since by tail calling + /// objc_autorelease, we also tail call -[NSObject autorelease] which supports + /// fast autoreleasing causing our object to be potentially reclaimed from the + /// autorelease pool, which violates the semantics of __autoreleasing types in + /// ARC.
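/// Minimal illustration (C-level sketch):
/// \code
///   return objc_autorelease(Obj); // must NOT be emitted as a tail call
/// \endcode
/// Emitted as a tail call, fast autoreleasing could let Obj be reclaimed
/// from the pool before the caller's __autoreleasing use of it.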
+ switch (Class) { + case ARCInstKind::Autorelease: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::Release: + case ARCInstKind::RetainBlock: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the nounwind attribute. +bool llvm::objcarc::IsNoThrow(ARCInstKind Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Release: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + return true; + case ARCInstKind::RetainBlock: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +/// Test whether the given instruction can autorelease any pointer or cause an +/// autoreleasepool pop. +/// +/// This means that it *could* interrupt the RV optimization. 
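/// For example (C-level sketch; Callee and Pool are illustrative): the RV
/// optimization pairs a callee's objc_autoreleaseReturnValue with the
/// caller's immediately following objc_retainAutoreleasedReturnValue, and
/// any class marked true below can break that handshake if it lands in
/// between:
/// \code
///   id R = Callee();                           // autoreleases its result
///   objc_autoreleasePoolPop(Pool);             // may reclaim R right here
///   R = objc_retainAutoreleasedReturnValue(R); // pairing is interrupted
/// \endcode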
+bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) { + switch (Class) { + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + return true; + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::RetainBlock: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: + case ARCInstKind::None: + case ARCInstKind::NoopCast: + return false; + } + llvm_unreachable("covered switch isn't covered?"); +} + +bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) { + switch (Kind) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::NoopCast: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: + case ARCInstKind::None: + return false; + + // The cases below are conservative. + + // RetainBlock can result in user defined copy constructors being called + // implying releases may occur. + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + return true; + } + + llvm_unreachable("covered switch isn't covered?"); +} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h new file mode 100644 index 0000000..636c65c --- /dev/null +++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h @@ -0,0 +1,123 @@ +//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H +#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H + +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" + +namespace llvm { +namespace objcarc { + +/// \enum ARCInstKind +/// +/// \brief Equivalence classes of instructions in the ARC Model. +/// +/// Since we do not have "instructions" to represent ARC concepts in LLVM IR, +/// we instead operate on equivalence classes of instructions. +/// +/// TODO: This should be split into two enums: a runtime entry point enum +/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals +/// with effects of instructions in the ARC model (which would handle the notion +/// of a User or CallOrUser). 
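/// A typical use (sketch) classifies a value once, then branches on the
/// predicate helpers declared below:
/// \code
///   ARCInstKind Class = GetBasicARCInstKind(V);
///   if (IsRetain(Class) || IsAutorelease(Class))
///     ... // V is an ARC runtime call with known semantics
/// \endcode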
+enum class ARCInstKind { + Retain, ///< objc_retain + RetainRV, ///< objc_retainAutoreleasedReturnValue + RetainBlock, ///< objc_retainBlock + Release, ///< objc_release + Autorelease, ///< objc_autorelease + AutoreleaseRV, ///< objc_autoreleaseReturnValue + AutoreleasepoolPush, ///< objc_autoreleasePoolPush + AutoreleasepoolPop, ///< objc_autoreleasePoolPop + NoopCast, ///< objc_retainedObject, etc. + FusedRetainAutorelease, ///< objc_retainAutorelease + FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue + LoadWeakRetained, ///< objc_loadWeakRetained (primitive) + StoreWeak, ///< objc_storeWeak (primitive) + InitWeak, ///< objc_initWeak (derived) + LoadWeak, ///< objc_loadWeak (derived) + MoveWeak, ///< objc_moveWeak (derived) + CopyWeak, ///< objc_copyWeak (derived) + DestroyWeak, ///< objc_destroyWeak (derived) + StoreStrong, ///< objc_storeStrong (derived) + IntrinsicUser, ///< clang.arc.use + CallOrUser, ///< could call objc_release and/or "use" pointers + Call, ///< could call objc_release + User, ///< could "use" a pointer + None ///< anything that is inert from an ARC perspective. +}; + +raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class); + +/// \brief Test if the given class is a kind of user. +bool IsUser(ARCInstKind Class); + +/// \brief Test if the given class is objc_retain or equivalent. +bool IsRetain(ARCInstKind Class); + +/// \brief Test if the given class is objc_autorelease or equivalent. +bool IsAutorelease(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which return their +/// argument verbatim. +bool IsForwarding(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which do nothing if +/// passed a null pointer. +bool IsNoopOnNull(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the "tail" keyword. +bool IsAlwaysTail(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which are never safe +/// to mark with the "tail" keyword. +bool IsNeverTail(ARCInstKind Class); + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the nounwind attribute. +bool IsNoThrow(ARCInstKind Class); + +/// Test whether the given instruction can autorelease any pointer or cause an +/// autoreleasepool pop. +bool CanInterruptRV(ARCInstKind Class); + +/// \brief Determine if F is one of the special known Functions. If it isn't, +/// return ARCInstKind::CallOrUser. +ARCInstKind GetFunctionClass(const Function *F); + +/// \brief Determine which objc runtime call instruction class V belongs to. +/// +/// This is similar to GetARCInstKind except that it only detects objc +/// runtime calls. This allows it to be faster. +/// +static inline ARCInstKind GetBasicARCInstKind(const Value *V) { + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (const Function *F = CI->getCalledFunction()) + return GetFunctionClass(F); + // Otherwise, be conservative. + return ARCInstKind::CallOrUser; + } + + // Otherwise, be conservative. + return isa<InvokeInst>(V) ? ARCInstKind::CallOrUser : ARCInstKind::User; +} + +/// Map V to its ARCInstKind equivalence class. +ARCInstKind GetARCInstKind(const Value *V); + +/// Returns false if conservatively we can prove that any instruction mapped to +/// this kind can not decrement ref counts. Returns true otherwise. 
+bool CanDecrementRefCount(ARCInstKind Kind); + +} // end namespace objcarc +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index e286dbc..d4fef10 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -27,22 +27,22 @@ namespace llvm { namespace objcarc { +enum class ARCRuntimeEntryPointKind { + AutoreleaseRV, + Release, + Retain, + RetainBlock, + Autorelease, + StoreStrong, + RetainRV, + RetainAutorelease, + RetainAutoreleaseRV, +}; + /// Declarations for ObjC runtime functions and constants. These are initialized /// lazily to avoid cluttering up the Module with unused declarations. class ARCRuntimeEntryPoints { public: - enum EntryPointType { - EPT_AutoreleaseRV, - EPT_Release, - EPT_Retain, - EPT_RetainBlock, - EPT_Autorelease, - EPT_StoreStrong, - EPT_RetainRV, - EPT_RetainAutorelease, - EPT_RetainAutoreleaseRV - }; - ARCRuntimeEntryPoints() : TheModule(nullptr), AutoreleaseRV(nullptr), Release(nullptr), @@ -54,9 +54,7 @@ public: RetainAutorelease(nullptr), RetainAutoreleaseRV(nullptr) { } - ~ARCRuntimeEntryPoints() { } - - void Initialize(Module *M) { + void init(Module *M) { TheModule = M; AutoreleaseRV = nullptr; Release = nullptr; @@ -69,30 +67,30 @@ public: RetainAutoreleaseRV = nullptr; } - Constant *get(const EntryPointType entry) { + Constant *get(ARCRuntimeEntryPointKind kind) { assert(TheModule != nullptr && "Not initialized."); - switch (entry) { - case EPT_AutoreleaseRV: + switch (kind) { + case ARCRuntimeEntryPointKind::AutoreleaseRV: return getI8XRetI8XEntryPoint(AutoreleaseRV, "objc_autoreleaseReturnValue", true); - case EPT_Release: + case ARCRuntimeEntryPointKind::Release: return getVoidRetI8XEntryPoint(Release, "objc_release"); - case EPT_Retain: + case ARCRuntimeEntryPointKind::Retain: return getI8XRetI8XEntryPoint(Retain, "objc_retain", true); - case EPT_RetainBlock: + case ARCRuntimeEntryPointKind::RetainBlock: return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false); - case EPT_Autorelease: + case ARCRuntimeEntryPointKind::Autorelease: return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true); - case EPT_StoreStrong: + case ARCRuntimeEntryPointKind::StoreStrong: return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong"); - case EPT_RetainRV: + case ARCRuntimeEntryPointKind::RetainRV: return getI8XRetI8XEntryPoint(RetainRV, "objc_retainAutoreleasedReturnValue", true); - case EPT_RetainAutorelease: + case ARCRuntimeEntryPointKind::RetainAutorelease: return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease", true); - case EPT_RetainAutoreleaseRV: + case ARCRuntimeEntryPointKind::RetainAutoreleaseRV: return getI8XRetI8XEntryPoint(RetainAutoreleaseRV, "objc_retainAutoreleaseReturnValue", true); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h new file mode 100644 index 0000000..d6439b6 --- /dev/null +++ b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h @@ -0,0 +1,108 @@ +//===- BlotMapVector.h - A MapVector with the blot operation -*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include <vector> +#include <algorithm> + +namespace llvm { +/// \brief An associative container with fast insertion-order (deterministic) +/// iteration over its elements. Plus the special blot operation. +template <class KeyT, class ValueT> class BlotMapVector { + /// Map keys to indices in Vector. + typedef DenseMap<KeyT, size_t> MapTy; + MapTy Map; + + typedef std::vector<std::pair<KeyT, ValueT>> VectorTy; + /// Keys and values. + VectorTy Vector; + +public: + typedef typename VectorTy::iterator iterator; + typedef typename VectorTy::const_iterator const_iterator; + iterator begin() { return Vector.begin(); } + iterator end() { return Vector.end(); } + const_iterator begin() const { return Vector.begin(); } + const_iterator end() const { return Vector.end(); } + +#ifdef XDEBUG + ~BlotMapVector() { + assert(Vector.size() >= Map.size()); // May differ due to blotting. + for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E; + ++I) { + assert(I->second < Vector.size()); + assert(Vector[I->second].first == I->first); + } + for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end(); + I != E; ++I) + assert(!I->first || (Map.count(I->first) && + Map[I->first] == size_t(I - Vector.begin()))); + } +#endif + + ValueT &operator[](const KeyT &Arg) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Arg, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(std::make_pair(Arg, ValueT())); + return Vector[Num].second; + } + return Vector[Pair.first->second].second; + } + + std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(InsertPair.first, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(InsertPair); + return std::make_pair(Vector.begin() + Num, true); + } + return std::make_pair(Vector.begin() + Pair.first->second, false); + } + + iterator find(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) + return Vector.end(); + return Vector.begin() + It->second; + } + + const_iterator find(const KeyT &Key) const { + typename MapTy::const_iterator It = Map.find(Key); + if (It == Map.end()) + return Vector.end(); + return Vector.begin() + It->second; + } + + /// This is similar to erase, but instead of removing the element from the + /// vector, it just zeros out the key in the vector. This leaves iterators + /// intact, but clients must be prepared for zeroed-out keys when iterating. 
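/// Usage sketch (A and B stand for arbitrary keys):
/// \code
///   BlotMapVector<const Value *, unsigned> MV;
///   MV[A] = 0; MV[B] = 1;           // iteration order: (A,0), (B,1)
///   MV.blot(A);                     // slot kept, key nulled: (null,0), (B,1)
///   assert(MV.find(A) == MV.end()); // lookups no longer see A
/// \endcode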
+ void blot(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) + return; + Vector[It->second].first = KeyT(); + Map.erase(It); + } + + void clear() { + Map.clear(); + Vector.clear(); + } + + bool empty() const { + assert(Map.empty() == Vector.empty()); + return Map.empty(); + } +}; +} // diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index f6c236c..4edd029 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -32,21 +32,20 @@ using namespace llvm::objcarc; /// Test whether the given instruction can result in a reference count /// modification (positive or negative) for the pointer's object. -bool -llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, - InstructionClass Class) { +bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { switch (Class) { - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_IntrinsicUser: - case IC_User: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: // These operations never directly modify a reference count. return false; default: break; } - ImmutableCallSite CS = static_cast<const Value *>(Inst); + ImmutableCallSite CS(Inst); assert(CS && "Only calls can alter reference counts!"); // See if AliasAnalysis can help us with the call. @@ -54,10 +53,12 @@ llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, if (AliasAnalysis::onlyReadsMemory(MRB)) return false; if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { + const DataLayout &DL = Inst->getModule()->getDataLayout(); for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) { const Value *Op = *I; - if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && + PA.related(Ptr, Op, DL)) return true; } return false; @@ -67,15 +68,29 @@ llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, return true; } +bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { + // First perform a quick check if Class can not touch ref counts. + if (!CanDecrementRefCount(Class)) + return false; + + // Otherwise, just use CanAlterRefCount for now. + return CanAlterRefCount(Inst, Ptr, PA, Class); +} + /// Test whether the given instruction can "use" the given pointer's object in a /// way that requires the reference count to be positive. -bool -llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, InstructionClass Class) { - // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. - if (Class == IC_Call) +bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class) { + // ARCInstKind::Call operations (as opposed to + // ARCInstKind::CallOrUser) never "use" objc pointers. + if (Class == ARCInstKind::Call) return false; + const DataLayout &DL = Inst->getModule()->getDataLayout(); + // Consider various instructions which may have pointer arguments which are // not "uses". 
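// Illustrative sketch (not part of this patch): a bare pointer comparison
// reads only the pointer bits, never the object itself, so it is not a
// "use" that requires the reference count to stay positive.
static bool isNilExample(const void *Obj) { return Obj == nullptr; }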
if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { @@ -84,29 +99,31 @@ llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, // of any other dynamic reference-counted pointers. if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA())) return false; - } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { + } else if (auto CS = ImmutableCallSite(Inst)) { // For calls, just check the arguments (and not the callee operand). for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), OE = CS.arg_end(); OI != OE; ++OI) { const Value *Op = *OI; - if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && + PA.related(Ptr, Op, DL)) return true; } return false; } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // Special-case stores, because we don't care about the stored value, just // the store address. - const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); + const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand(), DL); // If we can't tell what the underlying object was, assume there is a // dependence. - return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr); + return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && + PA.related(Op, Ptr, DL); } // Check each operand for a match. for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); OI != OE; ++OI) { const Value *Op = *OI; - if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op, DL)) return true; } return false; @@ -123,11 +140,11 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, switch (Flavor) { case NeedsPositiveRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: return false; default: return CanUse(Inst, Arg, PA, Class); @@ -135,10 +152,10 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case AutoreleasePoolBoundary: { - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPush: // These mark the end and begin of an autorelease pool scope. return true; default: @@ -148,13 +165,13 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case CanChangeRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); switch (Class) { - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // Conservatively assume this can decrement any count. 
return true; - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: return false; default: return CanAlterRefCount(Inst, Arg, PA, Class); @@ -162,28 +179,28 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case RetainAutoreleaseDep: - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: + switch (GetBasicARCInstKind(Inst)) { + case ARCInstKind::AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPush: // Don't merge an objc_autorelease with an objc_retain inside a different // autoreleasepool scope. return true; - case IC_Retain: - case IC_RetainRV: + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; + return GetArgRCIdentityRoot(Inst) == Arg; default: // Nothing else matters for objc_retainAutorelease formation. return false; } case RetainAutoreleaseRVDep: { - InstructionClass Class = GetBasicInstructionClass(Inst); + ARCInstKind Class = GetBasicARCInstKind(Inst); switch (Class) { - case IC_Retain: - case IC_RetainRV: + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; + return GetArgRCIdentityRoot(Inst) == Arg; default: // Anything that can autorelease interrupts // retainAutoreleaseReturnValue formation. @@ -192,7 +209,7 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, } case RetainRVDep: - return CanInterruptRV(GetBasicInstructionClass(Inst)); + return CanInterruptRV(GetBasicARCInstKind(Inst)); } llvm_unreachable("Invalid dependence flavor"); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h index 7b5601a..8e042d4 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -63,15 +63,24 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, /// Test whether the given instruction can "use" the given pointer's object in a /// way that requires the reference count to be positive. -bool -CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, - InstructionClass Class); +bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, + ARCInstKind Class); /// Test whether the given instruction can result in a reference count /// modification (positive or negative) for the pointer's object. -bool -CanAlterRefCount(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, InstructionClass Class); +bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); + +/// Returns true if we can not conservatively prove that Inst can not decrement +/// the reference count of Ptr. Returns false if we can. 
+bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); + +static inline bool CanDecrementRefCount(const Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA) { + return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst)); +} } // namespace objcarc } // namespace llvm diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 7a7eae8..7595e2d 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -24,6 +24,7 @@ #define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Optional.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ValueTracking.h" @@ -33,6 +34,7 @@ #include "llvm/Pass.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/Local.h" +#include "ARCInstKind.h" namespace llvm { class raw_ostream; @@ -68,160 +70,14 @@ static inline bool ModuleHasARC(const Module &M) { M.getNamedValue("clang.arc.use"); } -/// \enum InstructionClass -/// \brief A simple classification for instructions. -enum InstructionClass { - IC_Retain, ///< objc_retain - IC_RetainRV, ///< objc_retainAutoreleasedReturnValue - IC_RetainBlock, ///< objc_retainBlock - IC_Release, ///< objc_release - IC_Autorelease, ///< objc_autorelease - IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue - IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush - IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop - IC_NoopCast, ///< objc_retainedObject, etc. - IC_FusedRetainAutorelease, ///< objc_retainAutorelease - IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue - IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) - IC_StoreWeak, ///< objc_storeWeak (primitive) - IC_InitWeak, ///< objc_initWeak (derived) - IC_LoadWeak, ///< objc_loadWeak (derived) - IC_MoveWeak, ///< objc_moveWeak (derived) - IC_CopyWeak, ///< objc_copyWeak (derived) - IC_DestroyWeak, ///< objc_destroyWeak (derived) - IC_StoreStrong, ///< objc_storeStrong (derived) - IC_IntrinsicUser, ///< clang.arc.use - IC_CallOrUser, ///< could call objc_release and/or "use" pointers - IC_Call, ///< could call objc_release - IC_User, ///< could "use" a pointer - IC_None ///< anything else -}; - -raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class); - -/// \brief Test if the given class is a kind of user. -inline static bool IsUser(InstructionClass Class) { - return Class == IC_User || - Class == IC_CallOrUser || - Class == IC_IntrinsicUser; -} - -/// \brief Test if the given class is objc_retain or equivalent. -static inline bool IsRetain(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV; -} - -/// \brief Test if the given class is objc_autorelease or equivalent. -static inline bool IsAutorelease(InstructionClass Class) { - return Class == IC_Autorelease || - Class == IC_AutoreleaseRV; -} - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -static inline bool IsForwarding(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_NoopCast; -} - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. 
-static inline bool IsNoopOnNull(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_RetainBlock; -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -static inline bool IsAlwaysTail(InstructionClass Class) { - // IC_RetainBlock may be given a stack argument. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_AutoreleaseRV; -} - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -static inline bool IsNeverTail(InstructionClass Class) { - /// It is never safe to tail call objc_autorelease since by tail calling - /// objc_autorelease, we also tail call -[NSObject autorelease] which supports - /// fast autoreleasing causing our object to be potentially reclaimed from the - /// autorelease pool which violates the semantics of __autoreleasing types in - /// ARC. - return Class == IC_Autorelease; -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -static inline bool IsNoThrow(InstructionClass Class) { - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_AutoreleasepoolPush || - Class == IC_AutoreleasepoolPop; -} - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -static inline bool -CanInterruptRV(InstructionClass Class) { - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_CallOrUser: - case IC_Call: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - return true; - default: - return false; - } -} - -/// \brief Determine if F is one of the special known Functions. If it isn't, -/// return IC_CallOrUser. -InstructionClass GetFunctionClass(const Function *F); - -/// \brief Determine which objc runtime call instruction class V belongs to. -/// -/// This is similar to GetInstructionClass except that it only detects objc -/// runtime calls. This allows it to be faster. -/// -static inline InstructionClass GetBasicInstructionClass(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return IC_CallOrUser; - } - - // Otherwise, be conservative. - return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; -} - -/// \brief Determine what kind of construct V is. -InstructionClass GetInstructionClass(const Value *V); - /// \brief This is a wrapper around getUnderlyingObject which also knows how to /// look through objc_retain and objc_autorelease calls, which we know to return /// their argument verbatim. 
-static inline const Value *GetUnderlyingObjCPtr(const Value *V) { +static inline const Value *GetUnderlyingObjCPtr(const Value *V, + const DataLayout &DL) { for (;;) { - V = GetUnderlyingObject(V); - if (!IsForwarding(GetBasicInstructionClass(V))) + V = GetUnderlyingObject(V, DL); + if (!IsForwarding(GetBasicARCInstKind(V))) break; V = cast<CallInst>(V)->getArgOperand(0); } @@ -229,37 +85,44 @@ static inline const Value *GetUnderlyingObjCPtr(const Value *V) { return V; } -/// \brief This is a wrapper around Value::stripPointerCasts which also knows -/// how to look through objc_retain and objc_autorelease calls, which we know to -/// return their argument verbatim. -static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) { +/// The RCIdentity root of a value \p V is a dominating value U for which +/// retaining or releasing U is equivalent to retaining or releasing V. In other +/// words, ARC operations on \p V are equivalent to ARC operations on \p U. +/// +/// We use this in the ARC optimizer to make it easier to match up ARC +/// operations by always mapping ARC operations to RCIdentityRoots instead of +/// pointers themselves. +/// +/// The two ways that we see RCIdentical values in ObjC are via: +/// +/// 1. PointerCasts +/// 2. Forwarding Calls that return their argument verbatim. +/// +/// Thus this function strips off pointer casts and forwarding calls. *NOTE* +/// This implies that two RCIdentical values must alias. +static inline const Value *GetRCIdentityRoot(const Value *V) { for (;;) { V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) + if (!IsForwarding(GetBasicARCInstKind(V))) break; V = cast<CallInst>(V)->getArgOperand(0); } return V; } -/// \brief This is a wrapper around Value::stripPointerCasts which also knows -/// how to look through objc_retain and objc_autorelease calls, which we know to -/// return their argument verbatim. -static inline Value *StripPointerCastsAndObjCCalls(Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; +/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just +/// casts away the const of the result. For documentation about what an +/// RCIdentityRoot is (and, by extension, what GetRCIdentityRoot does), look at +/// that function. +static inline Value *GetRCIdentityRoot(Value *V) { + return const_cast<Value *>(GetRCIdentityRoot((const Value *)V)); } /// \brief Assuming the given instruction is one of the special calls such as -/// objc_retain or objc_release, return the argument value, stripped of no-op -/// casts and forwarding calls. -static inline Value *GetObjCArg(Value *Inst) { - return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); +/// objc_retain or objc_release, return the RCIdentity root of the argument of +/// the call. +static inline Value *GetArgRCIdentityRoot(Value *Inst) { + return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0)); } static inline bool IsNullOrUndef(const Value *V) { @@ -286,8 +149,8 @@ static inline void EraseInstruction(Instruction *CI) { if (!Unused) { // Replace the return value with the argument.
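// A C-level sketch of the rewrite above (illustrative): a forwarding call
// returns its argument verbatim, so
//   id X = objc_retain(Arg); use(X);
// can be rewritten as
//   objc_retain(Arg); use(Arg);
// after which the call itself, once proven unnecessary by the optimizer,
// can be erased without leaving dangling uses.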
- assert((IsForwarding(GetBasicInstructionClass(CI)) || - (IsNoopOnNull(GetBasicInstructionClass(CI)) && + assert((IsForwarding(GetBasicARCInstKind(CI)) || + (IsNoopOnNull(GetBasicARCInstKind(CI)) && isa<ConstantPointerNull>(OldArg))) && "Can't delete non-forwarding instruction with users!"); CI->replaceAllUsesWith(OldArg); @@ -344,15 +207,15 @@ static inline bool IsPotentialRetainableObjPtr(const Value *Op, return true; } -/// \brief Helper for GetInstructionClass. Determines what kind of construct CS +/// \brief Helper for GetARCInstKind. Determines what kind of construct CS /// is. -static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) { +static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) { for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) if (IsPotentialRetainableObjPtr(*I)) - return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; + return CS.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser; - return CS.onlyReadsMemory() ? IC_None : IC_Call; + return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call; } /// \brief Return true if this value refers to a distinct and identifiable @@ -371,7 +234,7 @@ static inline bool IsObjCIdentifiedObject(const Value *V) { if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { const Value *Pointer = - StripPointerCastsAndObjCCalls(LI->getPointerOperand()); + GetRCIdentityRoot(LI->getPointerOperand()); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { // A constant pointer can't be pointing to an object on the heap. It may // be reference-counted, but it won't be deleted. @@ -396,6 +259,55 @@ static inline bool IsObjCIdentifiedObject(const Value *V) { return false; } +enum class ARCMDKindID { + ImpreciseRelease, + CopyOnEscape, + NoObjCARCExceptions, +}; + +/// A cache of MDKinds used by various ARC optimizations. +class ARCMDKindCache { + Module *M; + + /// The Metadata Kind for clang.imprecise_release metadata. + llvm::Optional<unsigned> ImpreciseReleaseMDKind; + + /// The Metadata Kind for clang.arc.copy_on_escape metadata. + llvm::Optional<unsigned> CopyOnEscapeMDKind; + + /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. 
+ llvm::Optional<unsigned> NoObjCARCExceptionsMDKind; + +public: + void init(Module *Mod) { + M = Mod; + ImpreciseReleaseMDKind = NoneType::None; + CopyOnEscapeMDKind = NoneType::None; + NoObjCARCExceptionsMDKind = NoneType::None; + } + + unsigned get(ARCMDKindID ID) { + switch (ID) { + case ARCMDKindID::ImpreciseRelease: + if (!ImpreciseReleaseMDKind) + ImpreciseReleaseMDKind = + M->getContext().getMDKindID("clang.imprecise_release"); + return *ImpreciseReleaseMDKind; + case ARCMDKindID::CopyOnEscape: + if (!CopyOnEscapeMDKind) + CopyOnEscapeMDKind = + M->getContext().getMDKindID("clang.arc.copy_on_escape"); + return *CopyOnEscapeMDKind; + case ARCMDKindID::NoObjCARCExceptions: + if (!NoObjCARCExceptionsMDKind) + NoObjCARCExceptionsMDKind = + M->getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); + return *NoObjCARCExceptionsMDKind; + } + llvm_unreachable("Covered switch isn't covered?!"); + } +}; + } // end namespace objcarc } // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index 1a25391..d318643 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -97,11 +97,11 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { Instruction *Inst = I++; - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPush: + switch (GetBasicARCInstKind(Inst)) { + case ARCInstKind::AutoreleasepoolPush: Push = Inst; break; - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // If this pop matches a push and nothing in between can autorelease, // zap the pair. if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { @@ -115,7 +115,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { } Push = nullptr; break; - case IC_CallOrUser: + case ARCInstKind::CallOrUser: if (MayAutorelease(ImmutableCallSite(Inst))) Push = nullptr; break; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp index c61b6b0..b1515e3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -46,6 +46,11 @@ ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { return new ObjCARCAliasAnalysis(); } +bool ObjCARCAliasAnalysis::doInitialization(Module &M) { + InitializeAliasAnalysis(this, &M.getDataLayout()); + return true; +} + void ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -59,8 +64,8 @@ ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { // First, strip off no-ops, including ObjC-specific no-ops, and try making a // precise alias query. - const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); - const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); + const Value *SA = GetRCIdentityRoot(LocA.Ptr); + const Value *SB = GetRCIdentityRoot(LocB.Ptr); AliasResult Result = AliasAnalysis::alias(Location(SA, LocA.Size, LocA.AATags), Location(SB, LocB.Size, LocB.AATags)); @@ -69,8 +74,8 @@ ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. 
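The lazy-initialization pattern in ARCMDKindCache::get above is worth isolating: the kind ID is fetched from the context on first request and memoized in an Optional. A minimal sketch of the same pattern, using a counting stand-in for LLVMContext::getMDKindID (the names below are assumptions for illustration):

#include <cassert>
#include <optional>
#include <string>

static unsigned Lookups = 0;

// Stand-in for LLVMContext::getMDKindID: stable per name, but we count
// invocations so the test below can prove the cache answered the second call.
static unsigned lookupKindID(const std::string &Name) {
  ++Lookups;
  return static_cast<unsigned>(Name.size()); // stable toy ID
}

// Minimal sketch of the memoization in ARCMDKindCache.
class KindCache {
  std::optional<unsigned> ImpreciseRelease;
public:
  unsigned getImpreciseRelease() {
    if (!ImpreciseRelease)
      ImpreciseRelease = lookupKindID("clang.imprecise_release");
    return *ImpreciseRelease;
  }
};

int main() {
  KindCache C;
  unsigned First = C.getImpreciseRelease();
  assert(C.getImpreciseRelease() == First);
  assert(Lookups == 1); // memoized: the registry was consulted exactly once
  return 0;
}

Note that the real cache also resets its Optionals in init(), so one cache object can be reused across modules without carrying stale IDs.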
- const Value *UA = GetUnderlyingObjCPtr(SA); - const Value *UB = GetUnderlyingObjCPtr(SB); + const Value *UA = GetUnderlyingObjCPtr(SA, *DL); + const Value *UB = GetUnderlyingObjCPtr(SB, *DL); if (UA != SA || UB != SB) { Result = AliasAnalysis::alias(Location(UA), Location(UB)); // We can't use MustAlias or PartialAlias results here because @@ -92,14 +97,14 @@ ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, // First, strip off no-ops, including ObjC-specific no-ops, and try making // a precise alias query. - const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); + const Value *S = GetRCIdentityRoot(Loc.Ptr); if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.AATags), OrLocal)) return true; // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *U = GetUnderlyingObjCPtr(S); + const Value *U = GetUnderlyingObjCPtr(S, *DL); if (U != S) return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); @@ -120,7 +125,7 @@ ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { return AliasAnalysis::getModRefBehavior(F); switch (GetFunctionClass(F)) { - case IC_NoopCast: + case ARCInstKind::NoopCast: return DoesNotAccessMemory; default: break; @@ -134,15 +139,15 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { if (!EnableARCOpts) return AliasAnalysis::getModRefInfo(CS, Loc); - switch (GetBasicInstructionClass(CS.getInstruction())) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_NoopCast: - case IC_AutoreleasepoolPush: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: + switch (GetBasicARCInstKind(CS.getInstruction())) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::NoopCast: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h index 3fcea4e..3c5a021 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h @@ -44,9 +44,7 @@ namespace objcarc { } private: - void initializePass() override { - InitializeAliasAnalysis(this); - } + bool doInitialization(Module &M) override; /// This method is used when a pass implements an analysis interface through /// multiple inheritance. 
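The alias query in the hunk above is deliberately two-tier: ask a precise question about the RCIdentity-stripped pointers first, and only if that is inconclusive fall back to the underlying objects, where just a NoAlias answer is safe to trust (the hunk's own comment about not using MustAlias or PartialAlias makes the same point, since climbing to the underlying object can strip offsets). A toy rendition under those assumptions, with plain structs standing in for LLVM values:

#include <cassert>

enum Result { NoAlias, MayAlias, MustAlias };

struct Val {
  int Root;       // models GetRCIdentityRoot(P)
  int Underlying; // models GetUnderlyingObjCPtr(Root, DL)
};

static Result alias(const Val &A, const Val &B) {
  if (A.Root == B.Root)             // precise query on stripped roots
    return MustAlias;
  if (A.Underlying != B.Underlying) // imprecise fallback: distinct objects
    return NoAlias;
  return MayAlias;                  // cannot do better than conservative
}

int main() {
  Val P{1, 10}, Q{1, 10}, R{2, 20}, S{3, 10};
  assert(alias(P, Q) == MustAlias);
  assert(alias(P, R) == NoAlias);
  assert(alias(P, S) == MayAlias); // same object, different roots
  return 0;
}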
If needed, it should override this to adjust the diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index eb325eb..2a3139f 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Operator.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::objcarc; @@ -44,6 +45,10 @@ using namespace llvm::objcarc; STATISTIC(NumPeeps, "Number of calls peephole-optimized"); STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); +//===----------------------------------------------------------------------===// +// Declarations +//===----------------------------------------------------------------------===// + namespace { /// \brief Late ARC optimizations /// @@ -68,17 +73,23 @@ namespace { /// "tail". SmallPtrSet<CallInst *, 8> StoreStrongCalls; - bool OptimizeRetainCall(Function &F, Instruction *Retain); + /// Returns true if we eliminated Inst. + bool tryToPeepholeInstruction(Function &F, Instruction *Inst, + inst_iterator &Iter, + SmallPtrSetImpl<Instruction *> &DepInsts, + SmallPtrSetImpl<const BasicBlock *> &Visited, + bool &TailOkForStoreStrong); - bool ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSetImpl<Instruction *> - &DependingInstructions, - SmallPtrSetImpl<const BasicBlock *> - &Visited); + bool optimizeRetainCall(Function &F, Instruction *Retain); - void ContractRelease(Instruction *Release, - inst_iterator &Iter); + bool + contractAutorelease(Function &F, Instruction *Autorelease, + ARCInstKind Class, + SmallPtrSetImpl<Instruction *> &DependingInstructions, + SmallPtrSetImpl<const BasicBlock *> &Visited); + + void tryToContractReleaseIntoStoreStrong(Instruction *Release, + inst_iterator &Iter); void getAnalysisUsage(AnalysisUsage &AU) const override; bool doInitialization(Module &M) override; @@ -92,30 +103,15 @@ namespace { }; } -char ObjCARCContract::ID = 0; -INITIALIZE_PASS_BEGIN(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) - -Pass *llvm::createObjCARCContractPass() { - return new ObjCARCContract(); -} - -void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); -} +//===----------------------------------------------------------------------===// +// Implementation +//===----------------------------------------------------------------------===// /// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a /// return value. We do this late so we do not disrupt the dataflow analysis in /// ObjCARCOpt. 
-bool -ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { - ImmutableCallSite CS(GetObjCArg(Retain)); +bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) { + ImmutableCallSite CS(GetArgRCIdentityRoot(Retain)); const Instruction *Call = CS.getInstruction(); if (!Call) return false; @@ -139,7 +135,7 @@ ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { // We do not have to worry about tail calls/does not throw since // retain/retainRV have the same properties. - Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_RetainRV); + Constant *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV); cast<CallInst>(Retain)->setCalledFunction(Decl); DEBUG(dbgs() << "New: " << *Retain << "\n"); @@ -147,19 +143,16 @@ ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { } /// Merge an autorelease with a retain into a fused call. -bool -ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSetImpl<Instruction *> - &DependingInstructions, - SmallPtrSetImpl<const BasicBlock *> - &Visited) { - const Value *Arg = GetObjCArg(Autorelease); +bool ObjCARCContract::contractAutorelease( + Function &F, Instruction *Autorelease, ARCInstKind Class, + SmallPtrSetImpl<Instruction *> &DependingInstructions, + SmallPtrSetImpl<const BasicBlock *> &Visited) { + const Value *Arg = GetArgRCIdentityRoot(Autorelease); // Check that there are no instructions between the retain and the autorelease // (such as an autorelease_pop) which may change the count. CallInst *Retain = nullptr; - if (Class == IC_AutoreleaseRV) + if (Class == ARCInstKind::AutoreleaseRV) FindDependencies(RetainAutoreleaseRVDep, Arg, Autorelease->getParent(), Autorelease, DependingInstructions, Visited, PA); @@ -177,94 +170,208 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); DependingInstructions.clear(); - if (!Retain || - GetBasicInstructionClass(Retain) != IC_Retain || - GetObjCArg(Retain) != Arg) + if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain || + GetArgRCIdentityRoot(Retain) != Arg) return false; Changed = true; ++NumPeeps; - DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " - "retain/autorelease. Erasing: " << *Autorelease << "\n" - " Old Retain: " - << *Retain << "\n"); + DEBUG(dbgs() << " Fusing retain/autorelease!\n" + " Autorelease:" << *Autorelease << "\n" + " Retain: " << *Retain << "\n"); - Constant *Decl = EP.get(Class == IC_AutoreleaseRV ? - ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV : - ARCRuntimeEntryPoints::EPT_RetainAutorelease); + Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV + ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV + : ARCRuntimeEntryPointKind::RetainAutorelease); Retain->setCalledFunction(Decl); - DEBUG(dbgs() << " New Retain: " - << *Retain << "\n"); + DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n"); EraseInstruction(Autorelease); return true; } -/// Attempt to merge an objc_release with a store, load, and objc_retain to form -/// an objc_storeStrong. This can be a little tricky because the instructions -/// don't always appear in order, and there may be unrelated intervening -/// instructions. 
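contractAutorelease above pairs an objc_autorelease with the objc_retain it depends on and rewrites the retain to the fused runtime entry point. A self-contained sketch of just the rewrite, simplified to adjacent instructions (the real pass uses FindDependencies to tolerate intervening code and selects RetainAutoreleaseRV for the AutoreleaseRV flavor; the Inst type here is a toy assumption):

#include <cassert>
#include <string>
#include <vector>

// Toy instruction stream: each entry is an (opcode, argument) pair.
struct Inst { std::string Op; int Arg; };

// If the autorelease at index AR is immediately preceded by a retain of the
// same argument, fuse the pair into a single retainAutorelease call.
static bool fuseRetainAutorelease(std::vector<Inst> &BB, size_t AR) {
  if (AR == 0) return false;
  Inst &Prev = BB[AR - 1];
  if (Prev.Op != "retain" || Prev.Arg != BB[AR].Arg)
    return false;
  Prev.Op = "retainAutorelease"; // Retain->setCalledFunction(Decl)
  BB.erase(BB.begin() + AR);     // EraseInstruction(Autorelease)
  return true;
}

int main() {
  std::vector<Inst> BB = {{"retain", 7}, {"autorelease", 7}};
  assert(fuseRetainAutorelease(BB, 1));
  assert(BB.size() == 1 && BB[0].Op == "retainAutorelease");
  return 0;
}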
-void ObjCARCContract::ContractRelease(Instruction *Release, - inst_iterator &Iter) { - LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || !Load->isSimple()) return; +static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load, + Instruction *Release, + ProvenanceAnalysis &PA, + AliasAnalysis *AA) { + StoreInst *Store = nullptr; + bool SawRelease = false; - // For now, require everything to be in one basic block. - BasicBlock *BB = Release->getParent(); - if (Load->getParent() != BB) return; + // Get the location associated with Load. + AliasAnalysis::Location Loc = AA->getLocation(Load); // Walk down to find the store and the release, which may be in either order. - BasicBlock::iterator I = Load, End = BB->end(); - ++I; - AliasAnalysis::Location Loc = AA->getLocation(Load); - StoreInst *Store = nullptr; - bool SawRelease = false; - for (; !Store || !SawRelease; ++I) { - if (I == End) - return; + for (auto I = std::next(BasicBlock::iterator(Load)), + E = Load->getParent()->end(); + I != E; ++I) { + // If we found the store we were looking for and saw the release, + // break. There is no more work to be done. + if (Store && SawRelease) + break; - Instruction *Inst = I; + // Now we know that we have not seen either the store or the release. If I + // is the release, mark that we saw the release and continue. + Instruction *Inst = &*I; if (Inst == Release) { SawRelease = true; continue; } - InstructionClass Class = GetBasicInstructionClass(Inst); + // Otherwise, we check if Inst is a "good" store. Grab the instruction class + // of Inst. + ARCInstKind Class = GetBasicARCInstKind(Inst); - // Unrelated retains are harmless. + // If Inst is an unrelated retain, we don't care about it. + // + // TODO: This is one area where the optimization could be made more + // aggressive. if (IsRetain(Class)) continue; + // If we have seen the store, but not the release... if (Store) { - // The store is the point where we're going to put the objc_storeStrong, - // so make sure there are no uses after it. - if (CanUse(Inst, Load, PA, Class)) - return; - } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { - // We are moving the load down to the store, so check for anything - // else which writes to the memory between the load and the store. - Store = dyn_cast<StoreInst>(Inst); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; + // We need to make sure that it is safe to move the release from its + // current position to the store. This implies proving that any + // instruction in between Store and the Release conservatively cannot use + // the RCIdentityRoot of Release. If we can prove we can safely ignore + // Inst, continue... + if (!CanUse(Inst, Load, PA, Class)) { + continue; + } + + // Otherwise, be conservative and return nullptr. + return nullptr; } + + // Ok, now we know we have not seen a store yet. See if Inst can write to + // our load location; if it cannot, just ignore the instruction. + if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod)) + continue; + + Store = dyn_cast<StoreInst>(Inst); + + // If Inst can, then check whether Inst is a simple store. If Inst is not a + // store, or is a store that is not simple, then something we do not + // understand is writing to this memory, implying we cannot move the load + // over the write to any subsequent store that we may find. + if (!Store || !Store->isSimple()) + return nullptr; + + // Then make sure that the pointer we are storing to is Ptr. If so, we + // found our Store! + if (Store->getPointerOperand() == Loc.Ptr) + continue; + + // Otherwise, we have an unknown store to some other ptr that clobbers + // Loc.Ptr. Bail! + return nullptr; } - Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); + // If we did not find the store or did not see the release, fail. + if (!Store || !SawRelease) + return nullptr; + + // We succeeded! + return Store; +} - // Walk up to find the retain. - I = Store; - BasicBlock::iterator Begin = BB->begin(); - while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) +static Instruction * +findRetainForStoreStrongContraction(Value *New, StoreInst *Store, + Instruction *Release, + ProvenanceAnalysis &PA) { + // Walk up from the Store to find the retain. + BasicBlock::iterator I = Store; + BasicBlock::iterator Begin = Store->getParent()->begin(); + while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) { + Instruction *Inst = &*I; + + // It is only safe to move the retain to the store if we can prove + // conservatively that nothing besides the release can decrement reference + // counts in between the retain and the store. + if (CanDecrementRefCount(Inst, New, PA) && Inst != Release) + return nullptr; --I; + } Instruction *Retain = I; - if (GetBasicInstructionClass(Retain) != IC_Retain) return; - if (GetObjCArg(Retain) != New) return; + if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain) + return nullptr; + if (GetArgRCIdentityRoot(Retain) != New) + return nullptr; + return Retain; +} + +/// Attempt to merge an objc_release with a store, load, and objc_retain to form +/// an objc_storeStrong. An objc_storeStrong: +/// +/// objc_storeStrong(i8** %old_ptr, i8* %new_value) +/// +/// is equivalent to the following IR sequence: +/// +/// ; Load old value. +/// %old_value = load i8** %old_ptr (1) +/// +/// ; Increment the new value and then release the old value. This must occur +/// ; in order in case old_value releases new_value in its destructor causing +/// ; us to potentially have a dangling ptr. +/// tail call i8* @objc_retain(i8* %new_value) (2) +/// tail call void @objc_release(i8* %old_value) (3) +/// +/// ; Store the new_value into old_ptr +/// store i8* %new_value, i8** %old_ptr (4) +/// +/// The safety of this optimization is based around the following +/// considerations: +/// +/// 1. We are forming the store strong at the store. Thus to perform this +/// optimization it must be safe to move the retain, load, and release to +/// (4). +/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are +/// safe. void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release, inst_iterator &Iter) { + // See if we are releasing something that we just loaded. + auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release)); + if (!Load || !Load->isSimple()) + return; + + // For now, require everything to be in one basic block. + BasicBlock *BB = Release->getParent(); + if (Load->getParent() != BB) + return; + + // First scan down the BB from Load, looking for a store of the RCIdentityRoot + // of Load's pointer operand. + StoreInst *Store = + findSafeStoreForStoreStrongContraction(Load, Release, PA, AA); + // If we fail, bail. + if (!Store) + return; + + // Then find what new_value's RCIdentityRoot is. + Value *New = GetRCIdentityRoot(Store->getValueOperand()); + + // Then walk up the BB and look for a retain on New without any intervening + // instructions which conservatively might decrement ref counts.
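The four-instruction pattern spelled out in the tryToContractReleaseIntoStoreStrong comment above can be exercised in miniature. The sketch below assumes the instructions are adjacent and skips the alias and provenance proofs that findSafeStoreForStoreStrongContraction and findRetainForStoreStrongContraction perform; only the shape of the match and the rewrite are modeled, and the Inst type is a toy assumption:

#include <cassert>
#include <string>
#include <vector>

// Toy basic block: each instruction names an opcode plus the SSA-ish values
// it touches (A = value operand, B = pointer operand where relevant).
struct Inst { std::string Op; std::string A, B; };

// Match: %old = load %ptr ; retain %new ; release %old ; store %new -> %ptr
// and rewrite the quadruple to a single storeStrong(%ptr, %new).
static bool contractStoreStrong(std::vector<Inst> &BB) {
  for (size_t i = 0; i + 3 < BB.size(); ++i) {
    const Inst &L = BB[i], &Rt = BB[i + 1], &Rl = BB[i + 2], &S = BB[i + 3];
    if (L.Op == "load" && Rt.Op == "retain" && Rl.Op == "release" &&
        S.Op == "store" &&
        Rl.A == L.A /* releases the loaded old value */ &&
        Rt.A == S.A /* retains the value being stored */ &&
        S.B == L.B /* stores back to the loaded-from pointer */) {
      Inst Fused{"storeStrong", S.A, S.B};
      BB.erase(BB.begin() + i, BB.begin() + i + 4);
      BB.insert(BB.begin() + i, Fused);
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<Inst> BB = {{"load", "old", "ptr"},
                          {"retain", "new", ""},
                          {"release", "old", ""},
                          {"store", "new", "ptr"}};
  assert(contractStoreStrong(BB) && BB.size() == 1);
  assert(BB[0].Op == "storeStrong");
  return 0;
}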
+ Instruction *Retain = + findRetainForStoreStrongContraction(New, Store, Release, PA); + + // If we fail, bail. + if (!Retain) + return; Changed = true; ++NumStoreStrongs; + DEBUG( + llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n" + << " Old:\n" + << " Store: " << *Store << "\n" + << " Release: " << *Release << "\n" + << " Retain: " << *Retain << "\n" + << " Load: " << *Load << "\n"); + LLVMContext &C = Release->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *I8XX = PointerType::getUnqual(I8X); @@ -274,7 +381,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, Args[0] = new BitCastInst(Args[0], I8XX, "", Store); if (Args[1]->getType() != I8X) Args[1] = new BitCastInst(Args[1], I8X, "", Store); - Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_StoreStrong); + Constant *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong); CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store); StoreStrong->setDoesNotThrow(); StoreStrong->setDebugLoc(Store->getDebugLoc()); @@ -284,6 +391,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release, // we can set the tail flag once we know it's safe. StoreStrongCalls.insert(StoreStrong); + DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n"); + if (&*Iter == Store) ++Iter; Store->eraseFromParent(); Release->eraseFromParent(); @@ -292,85 +401,34 @@ void ObjCARCContract::ContractRelease(Instruction *Release, Load->eraseFromParent(); } -bool ObjCARCContract::doInitialization(Module &M) { - // If nothing in the Module uses ARC, don't do anything. - Run = ModuleHasARC(M); - if (!Run) - return false; - - EP.Initialize(&M); - - // Initialize RetainRVMarker. - RetainRVMarker = nullptr; - if (NamedMDNode *NMD = - M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) - if (NMD->getNumOperands() == 1) { - const MDNode *N = NMD->getOperand(0); - if (N->getNumOperands() == 1) - if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) - RetainRVMarker = S; - } - - return false; -} - -bool ObjCARCContract::runOnFunction(Function &F) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - if (!Run) - return false; - - Changed = false; - AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - PA.setAA(&getAnalysis<AliasAnalysis>()); - - // Track whether it's ok to mark objc_storeStrong calls with the "tail" - // keyword. Be conservative if the function has variadic arguments. - // It seems that functions which "return twice" are also unsafe for the - // "tail" argument, because they are setjmp, which could need to - // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && - !F.callsFunctionThatReturnsTwice(); - - // For ObjC library calls which return their argument, replace uses of the - // argument with uses of the call return value, if it dominates the use. This - // reduces register pressure. 
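The register-pressure rationale in the comment above is concrete: once a forwarding call's return value replaces the dominated uses of its argument, only one of the two SSA values stays live past the call. A toy of the dominance-gated rewrite, with straight-line position standing in for real dominance (the types are illustrative assumptions):

#include <cassert>
#include <vector>

// Toy model of a use: which value it consumes, and where it sits in a
// straight-line block (earlier position = dominates).
struct Use { int Value; int Position; };

// Replace uses of OldVal with NewVal when the definition at DefPos dominates.
static void replaceDominatedUses(std::vector<Use> &Uses, int OldVal,
                                 int NewVal, int DefPos) {
  for (Use &U : Uses)
    if (U.Value == OldVal && DefPos < U.Position)
      U.Value = NewVal;
}

int main() {
  // %1 = objc_retain(%0) at position 1; uses of %0 at positions 0 and 2.
  std::vector<Use> Uses = {{0, 0}, {0, 2}};
  replaceDominatedUses(Uses, /*OldVal=*/0, /*NewVal=*/1, /*DefPos=*/1);
  assert(Uses[0].Value == 0); // use before the call is untouched
  assert(Uses[1].Value == 1); // dominated use now consumes the return value
  return 0;
}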
- SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); - +bool ObjCARCContract::tryToPeepholeInstruction( + Function &F, Instruction *Inst, inst_iterator &Iter, + SmallPtrSetImpl<Instruction *> &DependingInsts, + SmallPtrSetImpl<const BasicBlock *> &Visited, + bool &TailOkForStoreStrongs) { // Only these library routines return their argument. In particular, // objc_retainBlock does not necessarily return its argument. - InstructionClass Class = GetBasicInstructionClass(Inst); + ARCInstKind Class = GetBasicARCInstKind(Inst); switch (Class) { - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - break; - case IC_Autorelease: - case IC_AutoreleaseRV: - if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) - continue; - break; - case IC_Retain: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + return false; + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + return contractAutorelease(F, Inst, Class, DependingInsts, Visited); + case ARCInstKind::Retain: // Attempt to convert retains to retainrvs if they are next to function // calls. - if (!OptimizeRetainCall(F, Inst)) - break; + if (!optimizeRetainCall(F, Inst)) + return false; // If we succeed in our optimization, fall through. // FALLTHROUGH - case IC_RetainRV: { + case ARCInstKind::RetainRV: { // If we're compiling for a target which needs a special inline-asm // marker to do the retainAutoreleasedReturnValue optimization, // insert it now. if (!RetainRVMarker) - break; + return false; BasicBlock::iterator BBI = Inst; BasicBlock *InstParent = Inst->getParent(); @@ -388,8 +446,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { --BBI; } while (IsNoopInstruction(BBI)); - if (&*BBI == GetObjCArg(Inst)) { - DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " + if (&*BBI == GetArgRCIdentityRoot(Inst)) { + DEBUG(dbgs() << "Adding inline asm marker for " "retainAutoreleasedReturnValue optimization.\n"); Changed = true; InlineAsm *IA = @@ -400,9 +458,9 @@ bool ObjCARCContract::runOnFunction(Function &F) { CallInst::Create(IA, "", Inst); } decline_rv_optimization: - break; + return false; } - case IC_InitWeak: { + case ARCInstKind::InitWeak: { // objc_initWeak(p, null) => *p = null CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(1))) { @@ -417,31 +475,80 @@ bool ObjCARCContract::runOnFunction(Function &F) { CI->replaceAllUsesWith(Null); CI->eraseFromParent(); } - continue; + return true; } - case IC_Release: - ContractRelease(Inst, I); - continue; - case IC_User: + case ARCInstKind::Release: + // Try to form an objc store strong from our release. If we fail, there is + // nothing further to do below, so continue. + tryToContractReleaseIntoStoreStrong(Inst, Iter); + return true; + case ARCInstKind::User: // Be conservative if the function has any alloca instructions. // Technically we only care about escaping alloca instructions, // but this is sufficient to handle some interesting cases. if (isa<AllocaInst>(Inst)) TailOkForStoreStrongs = false; - continue; - case IC_IntrinsicUser: + return true; + case ARCInstKind::IntrinsicUser: // Remove calls to @clang.arc.use(...). 
Inst->eraseFromParent(); - continue; + return true; default: - continue; + return true; } +} + +//===----------------------------------------------------------------------===// +// Top Level Driver +//===----------------------------------------------------------------------===// + +bool ObjCARCContract::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + Changed = false; + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n"); + + // Track whether it's ok to mark objc_storeStrong calls with the "tail" + // keyword. Be conservative if the function has variadic arguments. + // It seems that functions which "return twice" are also unsafe for the + // "tail" argument, because they are setjmp, which could need to + // return to an earlier stack state. + bool TailOkForStoreStrongs = + !F.isVarArg() && !F.callsFunctionThatReturnsTwice(); + + // For ObjC library calls which return their argument, replace uses of the + // argument with uses of the call return value, if it dominates the use. This + // reduces register pressure. + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) { + Instruction *Inst = &*I++; + + DEBUG(dbgs() << "Visiting: " << *Inst << "\n"); + + // First try to peephole Inst. If there is nothing further we can do in + // terms of undoing objc-arc-expand, process the next inst. + if (tryToPeepholeInstruction(F, Inst, I, DependingInstructions, Visited, + TailOkForStoreStrongs)) + continue; - DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); + // Otherwise, try to undo objc-arc-expand. - // Don't use GetObjCArg because we don't want to look through bitcasts + // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts // and such; to do the replacement, the argument must have type i8*. Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + + // TODO: Change this to a do-while. for (;;) { // If we're compiling bugpointed code, don't get in trouble. if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) @@ -458,7 +565,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { // reachability here because an unreachable call is considered to // trivially dominate itself, which would lead us to rewriting its // argument in terms of its return value, which would lead to - // infinite loops in GetObjCArg. + // infinite loops in GetArgRCIdentityRoot. 
if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; @@ -514,3 +621,45 @@ bool ObjCARCContract::runOnFunction(Function &F) { return Changed; } + +//===----------------------------------------------------------------------===// +// Misc Pass Manager +//===----------------------------------------------------------------------===// + +char ObjCARCContract::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract", + "ObjC ARC contraction", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract", + "ObjC ARC contraction", false, false) + +void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); +} + +Pass *llvm::createObjCARCContractPass() { return new ObjCARCContract(); } + +bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. + Run = ModuleHasARC(M); + if (!Run) + return false; + + EP.init(&M); + + // Initialize RetainRVMarker. + RetainRVMarker = nullptr; + if (NamedMDNode *NMD = + M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) + if (NMD->getNumOperands() == 1) { + const MDNode *N = NMD->getOperand(0); + if (N->getNumOperands() == 1) + if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) + RetainRVMarker = S; + } + + return false; +} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index bf9fcbb..53c19c3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -99,13 +99,13 @@ bool ObjCARCExpand::runOnFunction(Function &F) { DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); - switch (GetBasicInstructionClass(Inst)) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: { + switch (GetBasicARCInstKind(Inst)) { + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: { // These calls return their argument verbatim, as a low-level // optimization. However, this makes high-level optimizations // harder. Undo any uses of this optimization that the front-end diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 76932e6..dca3f1b 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -26,9 +26,11 @@ #include "ObjCARC.h" #include "ARCRuntimeEntryPoints.h" +#include "BlotMapVector.h" #include "DependencyAnalysis.h" #include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" +#include "PtrState.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" @@ -45,106 +47,10 @@ using namespace llvm::objcarc; #define DEBUG_TYPE "objc-arc-opts" -/// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific. -/// @{ - -namespace { - /// \brief An associative container with fast insertion-order (deterministic) - /// iteration over its elements. Plus the special blot operation. 
- template<class KeyT, class ValueT> - class MapVector { - /// Map keys to indices in Vector. - typedef DenseMap<KeyT, size_t> MapTy; - MapTy Map; - - typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; - /// Keys and values. - VectorTy Vector; - - public: - typedef typename VectorTy::iterator iterator; - typedef typename VectorTy::const_iterator const_iterator; - iterator begin() { return Vector.begin(); } - iterator end() { return Vector.end(); } - const_iterator begin() const { return Vector.begin(); } - const_iterator end() const { return Vector.end(); } - -#ifdef XDEBUG - ~MapVector() { - assert(Vector.size() >= Map.size()); // May differ due to blotting. - for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); - I != E; ++I) { - assert(I->second < Vector.size()); - assert(Vector[I->second].first == I->first); - } - for (typename VectorTy::const_iterator I = Vector.begin(), - E = Vector.end(); I != E; ++I) - assert(!I->first || - (Map.count(I->first) && - Map[I->first] == size_t(I - Vector.begin()))); - } -#endif - - ValueT &operator[](const KeyT &Arg) { - std::pair<typename MapTy::iterator, bool> Pair = - Map.insert(std::make_pair(Arg, size_t(0))); - if (Pair.second) { - size_t Num = Vector.size(); - Pair.first->second = Num; - Vector.push_back(std::make_pair(Arg, ValueT())); - return Vector[Num].second; - } - return Vector[Pair.first->second].second; - } - - std::pair<iterator, bool> - insert(const std::pair<KeyT, ValueT> &InsertPair) { - std::pair<typename MapTy::iterator, bool> Pair = - Map.insert(std::make_pair(InsertPair.first, size_t(0))); - if (Pair.second) { - size_t Num = Vector.size(); - Pair.first->second = Num; - Vector.push_back(InsertPair); - return std::make_pair(Vector.begin() + Num, true); - } - return std::make_pair(Vector.begin() + Pair.first->second, false); - } - - iterator find(const KeyT &Key) { - typename MapTy::iterator It = Map.find(Key); - if (It == Map.end()) return Vector.end(); - return Vector.begin() + It->second; - } - - const_iterator find(const KeyT &Key) const { - typename MapTy::const_iterator It = Map.find(Key); - if (It == Map.end()) return Vector.end(); - return Vector.begin() + It->second; - } - - /// This is similar to erase, but instead of removing the element from the - /// vector, it just zeros out the key in the vector. This leaves iterators - /// intact, but clients must be prepared for zeroed-out keys when iterating. - void blot(const KeyT &Key) { - typename MapTy::iterator It = Map.find(Key); - if (It == Map.end()) return; - Vector[It->second].first = KeyT(); - Map.erase(It); - } - - void clear() { - Map.clear(); - Vector.clear(); - } - }; -} - -/// @} -/// /// \defgroup ARCUtilities Utility declarations/definitions specific to ARC. /// @{ -/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon +/// \brief This is similar to GetRCIdentityRoot but it stops as soon /// as it finds a value with multiple uses. 
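The blot operation that gives the deleted MapVector (and its BlotMapVector successor) its name is the interesting part: a real erase would disturb the deterministic iteration order the ARC optimizer relies on, so blotting only clears the key and leaves the slot in place. A compact sketch of just that semantic, trading the DenseMap index for a linear scan:

#include <cassert>
#include <optional>
#include <utility>
#include <vector>

template <class K, class V> class BlotVector {
  std::vector<std::pair<std::optional<K>, V>> Data;
public:
  V &operator[](const K &Key) {
    for (auto &Entry : Data)
      if (Entry.first && *Entry.first == Key)
        return Entry.second;
    Data.push_back({Key, V()});
    return Data.back().second;
  }
  // blot: forget the key but keep the slot, preserving iteration order
  // and leaving existing iterators intact.
  void blot(const K &Key) {
    for (auto &Entry : Data)
      if (Entry.first && *Entry.first == Key)
        Entry.first.reset();
  }
  size_t size() const { return Data.size(); }
};

int main() {
  BlotVector<int, int> M;
  M[1] = 10;
  M[2] = 20;
  M.blot(1);
  assert(M.size() == 2); // the slot remains; clients must skip blotted keys
  M[1] = 30;             // reinsertion appends a fresh slot at the end
  assert(M.size() == 3);
  return 0;
}

As the original comment notes, clients iterating a blotted map must be prepared to see zeroed-out keys.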
static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { if (Arg->hasOneUse()) { @@ -153,7 +59,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) if (GEP->hasAllZeroIndices()) return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); - if (IsForwarding(GetBasicInstructionClass(Arg))) + if (IsForwarding(GetBasicARCInstKind(Arg))) return FindSingleUseIdentifiedObject( cast<CallInst>(Arg)->getArgOperand(0)); if (!IsObjCIdentifiedObject(Arg)) @@ -165,7 +71,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { // trivial uses, we can still consider this to be a single-use value. if (IsObjCIdentifiedObject(Arg)) { for (const User *U : Arg->users()) - if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) + if (!U->use_empty() || GetRCIdentityRoot(U) != Arg) return nullptr; return Arg; @@ -177,13 +83,14 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { /// This is a wrapper around getUnderlyingObjCPtr along the lines of /// GetUnderlyingObjects except that it returns early when it sees the first /// alloca. -static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) { +static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V, + const DataLayout &DL) { SmallPtrSet<const Value *, 4> Visited; SmallVector<const Value *, 4> Worklist; Worklist.push_back(V); do { const Value *P = Worklist.pop_back_val(); - P = GetUnderlyingObjCPtr(P); + P = GetUnderlyingObjCPtr(P, DL); if (isa<AllocaInst>(P)) return true; @@ -198,8 +105,8 @@ static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) { } if (const PHINode *PN = dyn_cast<const PHINode>(P)) { - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - Worklist.push_back(PN->getIncomingValue(i)); + for (Value *IncValue : PN->incoming_values()) + Worklist.push_back(IncValue); continue; } } while (!Worklist.empty()); @@ -270,293 +177,6 @@ STATISTIC(NumReleasesAfterOpt, #endif namespace { - /// \enum Sequence - /// - /// \brief A sequence of states that a pointer may go through in which an - /// objc_retain and objc_release are actually needed. - enum Sequence { - S_None, - S_Retain, ///< objc_retain(x). - S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement. - S_Use, ///< any use of x. - S_Stop, ///< like S_Release, but code motion is stopped. - S_Release, ///< objc_release(x). - S_MovableRelease ///< objc_release(x), !clang.imprecise_release. - }; - - raw_ostream &operator<<(raw_ostream &OS, const Sequence S) - LLVM_ATTRIBUTE_UNUSED; - raw_ostream &operator<<(raw_ostream &OS, const Sequence S) { - switch (S) { - case S_None: - return OS << "S_None"; - case S_Retain: - return OS << "S_Retain"; - case S_CanRelease: - return OS << "S_CanRelease"; - case S_Use: - return OS << "S_Use"; - case S_Release: - return OS << "S_Release"; - case S_MovableRelease: - return OS << "S_MovableRelease"; - case S_Stop: - return OS << "S_Stop"; - } - llvm_unreachable("Unknown sequence type."); - } -} - -static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { - // The easy cases. - if (A == B) - return A; - if (A == S_None || B == S_None) - return S_None; - - if (A > B) std::swap(A, B); - if (TopDown) { - // Choose the side which is further along in the sequence. - if ((A == S_Retain || A == S_CanRelease) && - (B == S_CanRelease || B == S_Use)) - return B; - } else { - // Choose the side which is further along in the sequence. 
- if ((A == S_Use || A == S_CanRelease) && - (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease)) - return A; - // If both sides are releases, choose the more conservative one. - if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) - return A; - if (A == S_Release && B == S_MovableRelease) - return A; - } - - return S_None; -} - -namespace { - /// \brief Unidirectional information about either a - /// retain-decrement-use-release sequence or release-use-decrement-retain - /// reverse sequence. - struct RRInfo { - /// After an objc_retain, the reference count of the referenced - /// object is known to be positive. Similarly, before an objc_release, the - /// reference count of the referenced object is known to be positive. If - /// there are retain-release pairs in code regions where the retain count - /// is known to be positive, they can be eliminated, regardless of any side - /// effects between them. - /// - /// Also, a retain+release pair nested within another retain+release - /// pair all on the known same pointer value can be eliminated, regardless - /// of any intervening side effects. - /// - /// KnownSafe is true when either of these conditions is satisfied. - bool KnownSafe; - - /// True if the objc_release calls are all marked with the "tail" keyword. - bool IsTailCallRelease; - - /// If the Calls are objc_release calls and they all have a - /// clang.imprecise_release tag, this is the metadata tag. - MDNode *ReleaseMetadata; - - /// For a top-down sequence, the set of objc_retains or - /// objc_retainBlocks. For bottom-up, the set of objc_releases. - SmallPtrSet<Instruction *, 2> Calls; - - /// The set of optimal insert positions for moving calls in the opposite - /// sequence. - SmallPtrSet<Instruction *, 2> ReverseInsertPts; - - /// If this is true, we cannot perform code motion but can still remove - /// retain/release pairs. - bool CFGHazardAfflicted; - - RRInfo() : - KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr), - CFGHazardAfflicted(false) {} - - void clear(); - - /// Conservatively merge the two RRInfo. Returns true if a partial merge has - /// occurred, false otherwise. - bool Merge(const RRInfo &Other); - - }; -} - -void RRInfo::clear() { - KnownSafe = false; - IsTailCallRelease = false; - ReleaseMetadata = nullptr; - Calls.clear(); - ReverseInsertPts.clear(); - CFGHazardAfflicted = false; -} - -bool RRInfo::Merge(const RRInfo &Other) { - // Conservatively merge the ReleaseMetadata information. - if (ReleaseMetadata != Other.ReleaseMetadata) - ReleaseMetadata = nullptr; - - // Conservatively merge the boolean state. - KnownSafe &= Other.KnownSafe; - IsTailCallRelease &= Other.IsTailCallRelease; - CFGHazardAfflicted |= Other.CFGHazardAfflicted; - - // Merge the call sets. - Calls.insert(Other.Calls.begin(), Other.Calls.end()); - - // Merge the insert point sets. If there are any differences, - // that makes this a partial merge. - bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size(); - for (Instruction *Inst : Other.ReverseInsertPts) - Partial |= ReverseInsertPts.insert(Inst).second; - return Partial; -} - -namespace { - /// \brief This class summarizes several per-pointer runtime properties which - /// are propagated through the flow graph. - class PtrState { - /// True if the reference count is known to be incremented. - bool KnownPositiveRefCount; - - /// True if we've seen an opportunity for partial RR elimination, such as - /// pushing calls into a CFG triangle or into one side of a CFG diamond.
- bool Partial; - - /// The current position in the sequence. - unsigned char Seq : 8; - - /// Unidirectional information about the current sequence. - RRInfo RRI; - - public: - PtrState() : KnownPositiveRefCount(false), Partial(false), - Seq(S_None) {} - - - bool IsKnownSafe() const { - return RRI.KnownSafe; - } - - void SetKnownSafe(const bool NewValue) { - RRI.KnownSafe = NewValue; - } - - bool IsTailCallRelease() const { - return RRI.IsTailCallRelease; - } - - void SetTailCallRelease(const bool NewValue) { - RRI.IsTailCallRelease = NewValue; - } - - bool IsTrackingImpreciseReleases() const { - return RRI.ReleaseMetadata != nullptr; - } - - const MDNode *GetReleaseMetadata() const { - return RRI.ReleaseMetadata; - } - - void SetReleaseMetadata(MDNode *NewValue) { - RRI.ReleaseMetadata = NewValue; - } - - bool IsCFGHazardAfflicted() const { - return RRI.CFGHazardAfflicted; - } - - void SetCFGHazardAfflicted(const bool NewValue) { - RRI.CFGHazardAfflicted = NewValue; - } - - void SetKnownPositiveRefCount() { - DEBUG(dbgs() << "Setting Known Positive.\n"); - KnownPositiveRefCount = true; - } - - void ClearKnownPositiveRefCount() { - DEBUG(dbgs() << "Clearing Known Positive.\n"); - KnownPositiveRefCount = false; - } - - bool HasKnownPositiveRefCount() const { - return KnownPositiveRefCount; - } - - void SetSeq(Sequence NewSeq) { - DEBUG(dbgs() << "Old: " << Seq << "; New: " << NewSeq << "\n"); - Seq = NewSeq; - } - - Sequence GetSeq() const { - return static_cast<Sequence>(Seq); - } - - void ClearSequenceProgress() { - ResetSequenceProgress(S_None); - } - - void ResetSequenceProgress(Sequence NewSeq) { - DEBUG(dbgs() << "Resetting sequence progress.\n"); - SetSeq(NewSeq); - Partial = false; - RRI.clear(); - } - - void Merge(const PtrState &Other, bool TopDown); - - void InsertCall(Instruction *I) { - RRI.Calls.insert(I); - } - - void InsertReverseInsertPt(Instruction *I) { - RRI.ReverseInsertPts.insert(I); - } - - void ClearReverseInsertPts() { - RRI.ReverseInsertPts.clear(); - } - - bool HasReverseInsertPts() const { - return !RRI.ReverseInsertPts.empty(); - } - - const RRInfo &GetRRInfo() const { - return RRI; - } - }; -} - -void -PtrState::Merge(const PtrState &Other, bool TopDown) { - Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown); - KnownPositiveRefCount &= Other.KnownPositiveRefCount; - - // If we're not in a sequence (anymore), drop all associated state. - if (Seq == S_None) { - Partial = false; - RRI.clear(); - } else if (Partial || Other.Partial) { - // If we're doing a merge on a path that's previously seen a partial - // merge, conservatively drop the sequence, to avoid doing partial - // RR elimination. If the branch predicates for the two merge differ, - // mixing them is unsafe. - ClearSequenceProgress(); - } else { - // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this - // point, we know that currently we are not partial. Stash whether or not - // the merge operation caused us to undergo a partial merging of reverse - // insertion points. - Partial = RRI.Merge(Other.RRI); - } -} - -namespace { /// \brief Per-BasicBlock state. class BBState { /// The number of unique control paths from the entry which can reach this @@ -566,20 +186,18 @@ namespace { /// The number of unique control paths to exits from this block. unsigned BottomUpPathCount; - /// A type for PerPtrTopDown and PerPtrBottomUp. 
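The MergeSeqs logic deleted above (apparently relocated alongside the new PtrState.h include) is a tiny lattice join and can be checked standalone. A sketch of its bottom-up half, with the S_ prefixes dropped from the enumerators but the ordering preserved:

#include <cassert>

// Same ordering as the original Sequence enum.
enum Seq { None, Retain, CanRelease, Use, Stop, Release, MovableRelease };

// Toy version of the bottom-up branch of MergeSeqs: equal states merge to
// themselves, and incompatible states collapse to None so no unsound
// pairing survives a CFG merge.
static Seq mergeBottomUp(Seq A, Seq B) {
  if (A == B) return A;
  if (A == None || B == None) return None;
  if (A > B) { Seq T = A; A = B; B = T; }
  // The side further along in the (bottom-up) sequence wins...
  if ((A == Use || A == CanRelease) &&
      (B == Use || B == Release || B == Stop || B == MovableRelease))
    return A;
  // ...and between two releases, keep the more conservative one.
  if (A == Stop && (B == Release || B == MovableRelease)) return A;
  if (A == Release && B == MovableRelease) return A;
  return None;
}

int main() {
  assert(mergeBottomUp(Use, Release) == Use);          // use is further along
  assert(mergeBottomUp(Stop, MovableRelease) == Stop); // conservative pick
  assert(mergeBottomUp(Retain, Use) == None);          // incompatible: give up
  return 0;
}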
- typedef MapVector<const Value *, PtrState> MapTy; - /// The top-down traversal uses this to record information known about a /// pointer at the bottom of each block. - MapTy PerPtrTopDown; + BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown; /// The bottom-up traversal uses this to record information known about a /// pointer at the top of each block. - MapTy PerPtrBottomUp; + BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp; /// Effective predecessors of the current block ignoring ignorable edges and /// ignored backedges. SmallVector<BasicBlock *, 2> Preds; + /// Effective successors of the current block ignoring ignorable edges and /// ignored backedges. SmallVector<BasicBlock *, 2> Succs; @@ -589,26 +207,38 @@ namespace { BBState() : TopDownPathCount(0), BottomUpPathCount(0) { } - typedef MapTy::iterator ptr_iterator; - typedef MapTy::const_iterator ptr_const_iterator; + typedef decltype(PerPtrTopDown)::iterator top_down_ptr_iterator; + typedef decltype(PerPtrTopDown)::const_iterator const_top_down_ptr_iterator; - ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } - ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } - ptr_const_iterator top_down_ptr_begin() const { + top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } + top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } + const_top_down_ptr_iterator top_down_ptr_begin() const { return PerPtrTopDown.begin(); } - ptr_const_iterator top_down_ptr_end() const { + const_top_down_ptr_iterator top_down_ptr_end() const { return PerPtrTopDown.end(); } + bool hasTopDownPtrs() const { + return !PerPtrTopDown.empty(); + } - ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } - ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } - ptr_const_iterator bottom_up_ptr_begin() const { + typedef decltype(PerPtrBottomUp)::iterator bottom_up_ptr_iterator; + typedef decltype( + PerPtrBottomUp)::const_iterator const_bottom_up_ptr_iterator; + + bottom_up_ptr_iterator bottom_up_ptr_begin() { + return PerPtrBottomUp.begin(); + } + bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } + const_bottom_up_ptr_iterator bottom_up_ptr_begin() const { return PerPtrBottomUp.begin(); } - ptr_const_iterator bottom_up_ptr_end() const { + const_bottom_up_ptr_iterator bottom_up_ptr_end() const { return PerPtrBottomUp.end(); } + bool hasBottomUpPtrs() const { + return !PerPtrBottomUp.empty(); + } /// Mark this block as being an entry block, which has one path from the /// entry by definition. @@ -621,20 +251,20 @@ namespace { /// Attempt to find the PtrState object describing the top down state for /// pointer Arg. Return a new initialized PtrState describing the top down /// state for Arg if we do not find one. - PtrState &getPtrTopDownState(const Value *Arg) { + TopDownPtrState &getPtrTopDownState(const Value *Arg) { return PerPtrTopDown[Arg]; } /// Attempt to find the PtrState object describing the bottom up state for /// pointer Arg. Return a new initialized PtrState describing the bottom up /// state for Arg if we do not find one. - PtrState &getPtrBottomUpState(const Value *Arg) { + BottomUpPtrState &getPtrBottomUpState(const Value *Arg) { return PerPtrBottomUp[Arg]; } /// Attempt to find the PtrState object describing the bottom up state for /// pointer Arg. 
- ptr_iterator findPtrBottomUpState(const Value *Arg) { + bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) { return PerPtrBottomUp.find(Arg); } @@ -685,6 +315,11 @@ namespace { const unsigned BBState::OverflowOccurredValue = 0xffffffff; } +namespace llvm { +raw_ostream &operator<<(raw_ostream &OS, + BBState &BBState) LLVM_ATTRIBUTE_UNUSED; +} + void BBState::InitFromPred(const BBState &Other) { PerPtrTopDown = Other.PerPtrTopDown; TopDownPathCount = Other.TopDownPathCount; @@ -724,19 +359,18 @@ void BBState::MergePred(const BBState &Other) { // For each entry in the other set, if our set has an entry with the same key, // merge the entries. Otherwise, copy the entry and merge it with an empty // entry. - for (ptr_const_iterator MI = Other.top_down_ptr_begin(), - ME = Other.top_down_ptr_end(); MI != ME; ++MI) { - std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI); - Pair.first->second.Merge(Pair.second ? PtrState() : MI->second, + for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end(); + MI != ME; ++MI) { + auto Pair = PerPtrTopDown.insert(*MI); + Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second, /*TopDown=*/true); } // For each entry in our set, if the other set doesn't have an entry with the // same key, force it to merge with an empty entry. - for (ptr_iterator MI = top_down_ptr_begin(), - ME = top_down_ptr_end(); MI != ME; ++MI) + for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI) if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end()) - MI->second.Merge(PtrState(), /*TopDown=*/true); + MI->second.Merge(TopDownPtrState(), /*TopDown=*/true); } /// The bottom-up traversal uses this to merge information about successors to @@ -768,304 +402,80 @@ void BBState::MergeSucc(const BBState &Other) { // For each entry in the other set, if our set has an entry with the // same key, merge the entries. Otherwise, copy the entry and merge // it with an empty entry. - for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(), - ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) { - std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI); - Pair.first->second.Merge(Pair.second ? PtrState() : MI->second, + for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end(); + MI != ME; ++MI) { + auto Pair = PerPtrBottomUp.insert(*MI); + Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second, /*TopDown=*/false); } // For each entry in our set, if the other set doesn't have an entry // with the same key, force it to merge with an empty entry. - for (ptr_iterator MI = bottom_up_ptr_begin(), - ME = bottom_up_ptr_end(); MI != ME; ++MI) + for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME; + ++MI) if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end()) - MI->second.Merge(PtrState(), /*TopDown=*/false); + MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false); } -// Only enable ARC Annotations if we are building a debug version of -// libObjCARCOpts. -#ifndef NDEBUG -#define ARC_ANNOTATIONS -#endif - -// Define some macros along the lines of DEBUG and some helper functions to make -// it cleaner to create annotations in the source code and to no-op when not -// building in debug mode. -#ifdef ARC_ANNOTATIONS - -#include "llvm/Support/CommandLine.h" - -/// Enable/disable ARC sequence annotations. 
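The MergePred/MergeSucc routines above share one discipline: every pointer seen on either side of the CFG edge gets merged, and a pointer missing on one side is merged against a default-constructed state so stale optimism cannot survive. A small sketch of that discipline over plain maps, where min stands in for the real PtrState::Merge and 0 is the conservative default (all names here are illustrative):

#include <cassert>
#include <map>

using StateMap = std::map<int, int>;

static int mergeState(int A, int B) { return A < B ? A : B; }

static void mergeFrom(StateMap &Ours, const StateMap &Other) {
  // Merge every entry Other knows about; entries we lack start from the
  // default state, mirroring Merge(Pair.second ? DefaultState() : ...).
  for (const auto &KV : Other) {
    auto Ins = Ours.insert({KV.first, 0});
    Ins.first->second = mergeState(Ins.first->second, KV.second);
  }
  // Entries Other lacks are force-merged with an empty/default entry.
  for (auto &KV : Ours)
    if (!Other.count(KV.first))
      KV.second = mergeState(KV.second, 0);
}

int main() {
  StateMap A = {{1, 5}, {2, 7}};
  StateMap B = {{1, 3}};
  mergeFrom(A, B);
  assert(A[1] == 3); // merged across both predecessors
  assert(A[2] == 0); // missing in B: merged with the default state
  return 0;
}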
-static cl::opt<bool> -EnableARCAnnotations("enable-objc-arc-annotations", cl::init(false), - cl::desc("Enable emission of arc data flow analysis " - "annotations")); -static cl::opt<bool> -DisableCheckForCFGHazards("disable-objc-arc-checkforcfghazards", cl::init(false), - cl::desc("Disable check for cfg hazards when " - "annotating")); -static cl::opt<std::string> -ARCAnnotationTargetIdentifier("objc-arc-annotation-target-identifier", - cl::init(""), - cl::desc("filter out all data flow annotations " - "but those that apply to the given " - "target llvm identifier.")); - -/// This function appends a unique ARCAnnotationProvenanceSourceMDKind id to an -/// instruction so that we can track backwards when post processing via the llvm -/// arc annotation processor tool. If the function is an -static MDString *AppendMDNodeToSourcePtr(unsigned NodeId, - Value *Ptr) { - MDString *Hash = nullptr; - - // If pointer is a result of an instruction and it does not have a source - // MDNode it, attach a new MDNode onto it. If pointer is a result of - // an instruction and does have a source MDNode attached to it, return a - // reference to said Node. Otherwise just return 0. - if (Instruction *Inst = dyn_cast<Instruction>(Ptr)) { - MDNode *Node; - if (!(Node = Inst->getMetadata(NodeId))) { - // We do not have any node. Generate and attatch the hash MDString to the - // instruction. - - // We just use an MDString to ensure that this metadata gets written out - // of line at the module level and to provide a very simple format - // encoding the information herein. Both of these makes it simpler to - // parse the annotations by a simple external program. - std::string Str; - raw_string_ostream os(Str); - os << "(" << Inst->getParent()->getParent()->getName() << ",%" - << Inst->getName() << ")"; - - Hash = MDString::get(Inst->getContext(), os.str()); - Inst->setMetadata(NodeId, MDNode::get(Inst->getContext(),Hash)); - } else { - // We have a node. Grab its hash and return it. - assert(Node->getNumOperands() == 1 && - "An ARCAnnotationProvenanceSourceMDKind can only have 1 operand."); - Hash = cast<MDString>(Node->getOperand(0)); +raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) { + // Dump the pointers we are tracking. + OS << " TopDown State:\n"; + if (!BBInfo.hasTopDownPtrs()) { + DEBUG(llvm::dbgs() << " NONE!\n"); + } else { + for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end(); + I != E; ++I) { + const PtrState &P = I->second; + OS << " Ptr: " << *I->first + << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false") + << "\n ImpreciseRelease: " + << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n" + << " HasCFGHazards: " + << (P.IsCFGHazardAfflicted()?"true":"false") << "\n" + << " KnownPositive: " + << (P.HasKnownPositiveRefCount()?"true":"false") << "\n" + << " Seq: " + << P.GetSeq() << "\n"; } - } else if (Argument *Arg = dyn_cast<Argument>(Ptr)) { - std::string str; - raw_string_ostream os(str); - os << "(" << Arg->getParent()->getName() << ",%" << Arg->getName() - << ")"; - Hash = MDString::get(Arg->getContext(), os.str()); } - return Hash; -} - -static std::string SequenceToString(Sequence A) { - std::string str; - raw_string_ostream os(str); - os << A; - return os.str(); -} - -/// Helper function to change a Sequence into a String object using our overload -/// for raw_ostream so we only have printing code in one location. 
-static MDString *SequenceToMDString(LLVMContext &Context, - Sequence A) { - return MDString::get(Context, SequenceToString(A)); -} - -/// A simple function to generate a MDNode which describes the change in state -/// for Value *Ptr caused by Instruction *Inst. -static void AppendMDNodeToInstForPtr(unsigned NodeId, - Instruction *Inst, - Value *Ptr, - MDString *PtrSourceMDNodeID, - Sequence OldSeq, - Sequence NewSeq) { - MDNode *Node = nullptr; - Metadata *tmp[3] = {PtrSourceMDNodeID, - SequenceToMDString(Inst->getContext(), OldSeq), - SequenceToMDString(Inst->getContext(), NewSeq)}; - Node = MDNode::get(Inst->getContext(), tmp); - - Inst->setMetadata(NodeId, Node); -} - -/// Add to the beginning of the basic block llvm.ptr.annotations which show the -/// state of a pointer at the entrance to a basic block. -static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB, - Value *Ptr, Sequence Seq) { - // If we have a target identifier, make sure that we match it before - // continuing. - if(!ARCAnnotationTargetIdentifier.empty() && - !Ptr->getName().equals(ARCAnnotationTargetIdentifier)) - return; - - Module *M = BB->getParent()->getParent(); - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - Type *Params[] = {I8XX, I8XX}; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), Params, - /*isVarArg=*/false); - Constant *Callee = M->getOrInsertFunction(Name, FTy); - - IRBuilder<> Builder(BB, BB->getFirstInsertionPt()); - - Value *PtrName; - StringRef Tmp = Ptr->getName(); - if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) { - Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, - Tmp + "_STR"); - PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, - cast<Constant>(ActualPtrName), Tmp); - } - - Value *S; - std::string SeqStr = SequenceToString(Seq); - if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) { - Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, - SeqStr + "_STR"); - S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, - cast<Constant>(ActualPtrName), SeqStr); - } - - Builder.CreateCall2(Callee, PtrName, S); -} - -/// Add to the end of the basic block llvm.ptr.annotations which show the state -/// of the pointer at the bottom of the basic block. -static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB, - Value *Ptr, Sequence Seq) { - // If we have a target identifier, make sure that we match it before emitting - // an annotation. 
- if(!ARCAnnotationTargetIdentifier.empty() && - !Ptr->getName().equals(ARCAnnotationTargetIdentifier)) - return; - - Module *M = BB->getParent()->getParent(); - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - Type *Params[] = {I8XX, I8XX}; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), Params, - /*isVarArg=*/false); - Constant *Callee = M->getOrInsertFunction(Name, FTy); - - IRBuilder<> Builder(BB, std::prev(BB->end())); - - Value *PtrName; - StringRef Tmp = Ptr->getName(); - if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) { - Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, - Tmp + "_STR"); - PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, - cast<Constant>(ActualPtrName), Tmp); - } - - Value *S; - std::string SeqStr = SequenceToString(Seq); - if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) { - Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, - SeqStr + "_STR"); - S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, - cast<Constant>(ActualPtrName), SeqStr); + OS << " BottomUp State:\n"; + if (!BBInfo.hasBottomUpPtrs()) { + DEBUG(llvm::dbgs() << " NONE!\n"); + } else { + for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end(); + I != E; ++I) { + const PtrState &P = I->second; + OS << " Ptr: " << *I->first + << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false") + << "\n ImpreciseRelease: " + << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n" + << " HasCFGHazards: " + << (P.IsCFGHazardAfflicted()?"true":"false") << "\n" + << " KnownPositive: " + << (P.HasKnownPositiveRefCount()?"true":"false") << "\n" + << " Seq: " + << P.GetSeq() << "\n"; + } } - Builder.CreateCall2(Callee, PtrName, S); -} -/// Adds a source annotation to pointer and a state change annotation to Inst -/// referencing the source annotation and the old/new state of pointer. -static void GenerateARCAnnotation(unsigned InstMDId, - unsigned PtrMDId, - Instruction *Inst, - Value *Ptr, - Sequence OldSeq, - Sequence NewSeq) { - if (EnableARCAnnotations) { - // If we have a target identifier, make sure that we match it before - // emitting an annotation. - if(!ARCAnnotationTargetIdentifier.empty() && - !Ptr->getName().equals(ARCAnnotationTargetIdentifier)) - return; - - // First generate the source annotation on our pointer. This will return an - // MDString* if Ptr actually comes from an instruction implying we can put - // in a source annotation. If AppendMDNodeToSourcePtr returns 0 (i.e. NULL), - // then we know that our pointer is from an Argument so we put a reference - // to the argument number. - // - // The point of this is to make it easy for the - // llvm-arc-annotation-processor tool to cross reference where the source - // pointer is in the LLVM IR since the LLVM IR parser does not submit such - // information via debug info for backends to use (since why would anyone - // need such a thing from LLVM IR besides in non-standard cases - // [i.e. this]). - MDString *SourcePtrMDNode = - AppendMDNodeToSourcePtr(PtrMDId, Ptr); - AppendMDNodeToInstForPtr(InstMDId, Inst, Ptr, SourcePtrMDNode, OldSeq, - NewSeq); - } + return OS; } -// The actual interface for accessing the above functionality is defined via -// some simple macros which are defined below. 
We do this so that the user does -// not need to pass in what metadata id is needed resulting in cleaner code and -// additionally since it provides an easy way to conditionally no-op all -// annotation support in a non-debug build. - -/// Use this macro to annotate a sequence state change when processing -/// instructions bottom up, -#define ANNOTATE_BOTTOMUP(inst, ptr, old, new) \ - GenerateARCAnnotation(ARCAnnotationBottomUpMDKind, \ - ARCAnnotationProvenanceSourceMDKind, (inst), \ - const_cast<Value*>(ptr), (old), (new)) -/// Use this macro to annotate a sequence state change when processing -/// instructions top down. -#define ANNOTATE_TOPDOWN(inst, ptr, old, new) \ - GenerateARCAnnotation(ARCAnnotationTopDownMDKind, \ - ARCAnnotationProvenanceSourceMDKind, (inst), \ - const_cast<Value*>(ptr), (old), (new)) - -#define ANNOTATE_BB(_states, _bb, _name, _type, _direction) \ - do { \ - if (EnableARCAnnotations) { \ - for(BBState::ptr_const_iterator I = (_states)._direction##_ptr_begin(), \ - E = (_states)._direction##_ptr_end(); I != E; ++I) { \ - Value *Ptr = const_cast<Value*>(I->first); \ - Sequence Seq = I->second.GetSeq(); \ - GenerateARCBB ## _type ## Annotation(_name, (_bb), Ptr, Seq); \ - } \ - } \ - } while (0) - -#define ANNOTATE_BOTTOMUP_BBSTART(_states, _basicblock) \ - ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbstart", \ - Entrance, bottom_up) -#define ANNOTATE_BOTTOMUP_BBEND(_states, _basicblock) \ - ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbend", \ - Terminator, bottom_up) -#define ANNOTATE_TOPDOWN_BBSTART(_states, _basicblock) \ - ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbstart", \ - Entrance, top_down) -#define ANNOTATE_TOPDOWN_BBEND(_states, _basicblock) \ - ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbend", \ - Terminator, top_down) - -#else // !ARC_ANNOTATION -// If annotations are off, noop. -#define ANNOTATE_BOTTOMUP(inst, ptr, old, new) -#define ANNOTATE_TOPDOWN(inst, ptr, old, new) -#define ANNOTATE_BOTTOMUP_BBSTART(states, basicblock) -#define ANNOTATE_BOTTOMUP_BBEND(states, basicblock) -#define ANNOTATE_TOPDOWN_BBSTART(states, basicblock) -#define ANNOTATE_TOPDOWN_BBEND(states, basicblock) -#endif // !ARC_ANNOTATION - namespace { + /// \brief The main ARC optimization pass. class ObjCARCOpt : public FunctionPass { bool Changed; ProvenanceAnalysis PA; + + /// A cache of references to runtime entry point constants. ARCRuntimeEntryPoints EP; + /// A cache of MDKinds that can be passed into other functions to propagate + /// MDKind identifiers. + ARCMDKindCache MDKindCache; + // This is used to track if a pointer is stored into an alloca. DenseSet<const Value *> MultiOwnersSet; @@ -1076,73 +486,49 @@ namespace { /// is in fact used in the current function. unsigned UsedInThisFunction; - /// The Metadata Kind for clang.imprecise_release metadata. - unsigned ImpreciseReleaseMDKind; - - /// The Metadata Kind for clang.arc.copy_on_escape metadata. - unsigned CopyOnEscapeMDKind; - - /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. - unsigned NoObjCARCExceptionsMDKind; - -#ifdef ARC_ANNOTATIONS - /// The Metadata Kind for llvm.arc.annotation.bottomup metadata. - unsigned ARCAnnotationBottomUpMDKind; - /// The Metadata Kind for llvm.arc.annotation.topdown metadata. - unsigned ARCAnnotationTopDownMDKind; - /// The Metadata Kind for llvm.arc.annotation.provenancesource metadata. 
- unsigned ARCAnnotationProvenanceSourceMDKind; -#endif // ARC_ANNOATIONS - bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, - InstructionClass &Class); + ARCInstKind &Class); void OptimizeIndividualCalls(Function &F); void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; - bool VisitInstructionBottomUp(Instruction *Inst, - BasicBlock *BB, - MapVector<Value *, RRInfo> &Retains, + bool VisitInstructionBottomUp(Instruction *Inst, BasicBlock *BB, + BlotMapVector<Value *, RRInfo> &Retains, BBState &MyStates); bool VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains); + BlotMapVector<Value *, RRInfo> &Retains); bool VisitInstructionTopDown(Instruction *Inst, DenseMap<Value *, RRInfo> &Releases, BBState &MyStates); bool VisitTopDown(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, DenseMap<Value *, RRInfo> &Releases); - bool Visit(Function &F, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, + bool Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates, + BlotMapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases); void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, - MapVector<Value *, RRInfo> &Retains, + BlotMapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts, - Module *M); - - bool ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M, - SmallVectorImpl<Instruction *> &NewRetains, - SmallVectorImpl<Instruction *> &NewReleases, - SmallVectorImpl<Instruction *> &DeadInsts, - RRInfo &RetainsToMove, - RRInfo &ReleasesToMove, - Value *Arg, - bool KnownSafe, - bool &AnyPairsCompletelyEliminated); + SmallVectorImpl<Instruction *> &DeadInsts, Module *M); + + bool + PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates, + BlotMapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, Module *M, + SmallVectorImpl<Instruction *> &NewRetains, + SmallVectorImpl<Instruction *> &NewReleases, + SmallVectorImpl<Instruction *> &DeadInsts, + RRInfo &RetainsToMove, RRInfo &ReleasesToMove, + Value *Arg, bool KnownSafe, + bool &AnyPairsCompletelyEliminated); bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M); + BlotMapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, Module *M); void OptimizeWeakCalls(Function &F); @@ -1191,7 +577,7 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. 
- const Value *Arg = GetObjCArg(RetainRV); + const Value *Arg = GetArgRCIdentityRoot(RetainRV); ImmutableCallSite CS(Arg); if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { @@ -1216,8 +602,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); if (I != Begin) { do --I; while (I != Begin && IsNoopInstruction(I)); - if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && - GetObjCArg(I) == Arg) { + if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV && + GetArgRCIdentityRoot(I) == Arg) { Changed = true; ++NumPeeps; @@ -1238,7 +624,7 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { "objc_retain since the operand is not a return value.\n" "Old = " << *RetainRV << "\n"); - Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); + Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain); cast<CallInst>(RetainRV)->setCalledFunction(NewDecl); DEBUG(dbgs() << "New = " << *RetainRV << "\n"); @@ -1248,17 +634,17 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { /// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not /// used as a return value. -void -ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, - InstructionClass &Class) { +void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, + Instruction *AutoreleaseRV, + ARCInstKind &Class) { // Check for a return of the pointer value. - const Value *Ptr = GetObjCArg(AutoreleaseRV); + const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV); SmallVector<const Value *, 2> Users; Users.push_back(Ptr); do { Ptr = Users.pop_back_val(); for (const User *U : Ptr->users()) { - if (isa<ReturnInst>(U) || GetBasicInstructionClass(U) == IC_RetainRV) + if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV) return; if (isa<BitCastInst>(U)) Users.push_back(U); @@ -1274,10 +660,10 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, "Old = " << *AutoreleaseRV << "\n"); CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV); - Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Autorelease); + Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease); AutoreleaseRVCI->setCalledFunction(NewDecl); AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease. - Class = IC_Autorelease; + Class = ARCInstKind::Autorelease; DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n"); @@ -1294,7 +680,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; - InstructionClass Class = GetBasicInstructionClass(Inst); + ARCInstKind Class = GetBasicARCInstKind(Inst); DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); @@ -1309,7 +695,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // There are gray areas here, as the ability to cast reference-counted // pointers to raw void* and back allows code to break ARC assumptions, // however these are currently considered to be unimportant. - case IC_NoopCast: + case ARCInstKind::NoopCast: Changed = true; ++NumNoops; DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); @@ -1317,11 +703,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { continue; // If the pointer-to-weak-pointer is null, it's undefined behavior. 
- case IC_StoreWeak: - case IC_LoadWeak: - case IC_LoadWeakRetained: - case IC_InitWeak: - case IC_DestroyWeak: { + case ARCInstKind::StoreWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::InitWeak: + case ARCInstKind::DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; @@ -1338,8 +724,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } - case IC_CopyWeak: - case IC_MoveWeak: { + case ARCInstKind::CopyWeak: + case ARCInstKind::MoveWeak: { CallInst *CI = cast<CallInst>(Inst); if (IsNullOrUndef(CI->getArgOperand(0)) || IsNullOrUndef(CI->getArgOperand(1))) { @@ -1359,11 +745,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } - case IC_RetainRV: + case ARCInstKind::RetainRV: if (OptimizeRetainRVCall(F, Inst)) continue; break; - case IC_AutoreleaseRV: + case ARCInstKind::AutoreleaseRV: OptimizeAutoreleaseRVCall(F, Inst, Class); break; } @@ -1380,10 +766,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Create the declaration lazily. LLVMContext &C = Inst->getContext(); - Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release); + Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Release); CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "", Call); - NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None)); + NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), + MDNode::get(C, None)); DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " "since x is otherwise unused.\nOld: " << *Call << "\nNew: " @@ -1391,7 +778,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { EraseInstruction(Call); Inst = NewCall; - Class = IC_Release; + Class = ARCInstKind::Release; } } @@ -1422,11 +809,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << Class; + UsedInThisFunction |= 1 << unsigned(Class); continue; } - const Value *Arg = GetObjCArg(Inst); + const Value *Arg = GetArgRCIdentityRoot(Inst); // ARC calls with null are no-ops. Delete them. if (IsNullOrUndef(Arg)) { @@ -1440,7 +827,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Keep track of which of retain, release, autorelease, and retain_block // are actually present in this function. - UsedInThisFunction |= 1 << Class; + UsedInThisFunction |= 1 << unsigned(Class); // If Arg is a PHI, and one or more incoming values to the // PHI are null, and the call is control-equivalent to the PHI, and there @@ -1463,7 +850,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { bool HasCriticalEdges = false; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + GetRCIdentityRoot(PN->getIncomingValue(i)); if (IsNullOrUndef(Incoming)) HasNull = true; else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) @@ -1480,25 +867,25 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Check that there is nothing that cares about the reference // count between the call and the phi. switch (Class) { - case IC_Retain: - case IC_RetainBlock: + case ARCInstKind::Retain: + case ARCInstKind::RetainBlock: // These can always be moved up. break; - case IC_Release: + case ARCInstKind::Release: // These can't be moved across things that care about the retain // count. 
FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); break; - case IC_Autorelease: + case ARCInstKind::Autorelease: // These can't be moved across autorelease pool scope boundaries. FindDependencies(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); break; - case IC_RetainRV: - case IC_AutoreleaseRV: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: // Don't move these; the RV optimization depends on the autoreleaseRV // being tail called, and the retainRV being immediately after a call // (which might still happen if we get lucky with codegen layout, but @@ -1517,7 +904,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { Type *ParamTy = CInst->getArgOperand(0)->getType(); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + GetRCIdentityRoot(PN->getIncomingValue(i)); if (!IsNullOrUndef(Incoming)) { CallInst *Clone = cast<CallInst>(CInst->clone()); Value *Op = PN->getIncomingValue(i); @@ -1547,7 +934,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { /// no CFG hazards by checking the states of various bottom up pointers. static void CheckForUseCFGHazard(const Sequence SuccSSeq, const bool SuccSRRIKnownSafe, - PtrState &S, + TopDownPtrState &S, bool &SomeSuccHasSame, bool &AllSuccsHaveSame, bool &NotAllSeqEqualButKnownSafe, @@ -1585,7 +972,7 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, /// pointers. static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, const bool SuccSRRIKnownSafe, - PtrState &S, + TopDownPtrState &S, bool &SomeSuccHasSame, bool &AllSuccsHaveSame, bool &NotAllSeqEqualButKnownSafe) { @@ -1618,9 +1005,9 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, BBState &MyStates) const { // If any top-down local-use or possible-dec has a succ which is earlier in // the sequence, forget it. - for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), - E = MyStates.top_down_ptr_end(); I != E; ++I) { - PtrState &S = I->second; + for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end(); + I != E; ++I) { + TopDownPtrState &S = I->second; const Sequence Seq = I->second.GetSeq(); // We only care about S_Retain, S_CanRelease, and S_Use. 
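The repeated `UsedInThisFunction |= 1 << unsigned(Class)` changes in the hunks above follow from ARCInstKind being a scoped enum: unlike the old InstructionClass, it no longer converts implicitly to an integer, so every use as a shift amount needs an explicit cast. A minimal standalone sketch of the same bitmask bookkeeping, with illustrative names rather than the pass's real API:

    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    // Illustrative stand-in for ARCInstKind: a scoped enum, so using a value
    // as a bit index requires the explicit unsigned(...) cast seen in the diff.
    enum class Kind : unsigned { Retain, Release, Autorelease, LoadWeak };

    static uint32_t UsedInThisFunction = 0;

    static void noteUse(Kind K) { UsedInThisFunction |= 1u << unsigned(K); }

    static bool anyUsed(std::initializer_list<Kind> Ks) {
      for (Kind K : Ks)
        if (UsedInThisFunction & (1u << unsigned(K)))
          return true;
      return false;
    }

    int main() {
      noteUse(Kind::Retain);
      noteUse(Kind::Release);
      // Mirrors runOnFunction's gating: only run a sub-optimization when the
      // relevant calls were actually seen in the current function.
      if (anyUsed({Kind::Retain}) && anyUsed({Kind::Release}))
        std::printf("would run retain/release pairing\n");
      if (!anyUsed({Kind::LoadWeak}))
        std::printf("would skip weak optimizations\n");
    }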
@@ -1646,7 +1033,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); assert(BBI != BBStates.end()); - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); const Sequence SuccSSeq = SuccS.GetSeq(); // If bottom up, the pointer is in an S_None state, clear the sequence @@ -1705,94 +1092,53 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, } } -bool -ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, - BasicBlock *BB, - MapVector<Value *, RRInfo> &Retains, - BBState &MyStates) { +bool ObjCARCOpt::VisitInstructionBottomUp( + Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains, + BBState &MyStates) { bool NestingDetected = false; - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); const Value *Arg = nullptr; - DEBUG(dbgs() << "Class: " << Class << "\n"); + DEBUG(dbgs() << " Class: " << Class << "\n"); switch (Class) { - case IC_Release: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrBottomUpState(Arg); - - // If we see two releases in a row on the same pointer. If so, make - // a note, and we'll cicle back to revisit it after we've - // hopefully eliminated the second release, which may allow us to - // eliminate the first release too. - // Theoretically we could implement removal of nested retain+release - // pairs by making PtrState hold a stack of states, but this is - // simple and avoids adding overhead for the non-nested case. - if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) { - DEBUG(dbgs() << "Found nested releases (i.e. a release pair)\n"); - NestingDetected = true; - } + case ARCInstKind::Release: { + Arg = GetArgRCIdentityRoot(Inst); - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release; - ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq); - S.ResetSequenceProgress(NewSeq); - S.SetReleaseMetadata(ReleaseMetadata); - S.SetKnownSafe(S.HasKnownPositiveRefCount()); - S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall()); - S.InsertCall(Inst); - S.SetKnownPositiveRefCount(); + BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg); + NestingDetected |= S.InitBottomUp(MDKindCache, Inst); break; } - case IC_RetainBlock: + case ARCInstKind::RetainBlock: // In OptimizeIndividualCalls, we have strength reduced all optimizable // objc_retainBlocks to objc_retains. Thus at this point any // objc_retainBlocks that we see are not optimizable. break; - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrBottomUpState(Arg); - S.SetKnownPositiveRefCount(); - - Sequence OldSeq = S.GetSeq(); - switch (OldSeq) { - case S_Stop: - case S_Release: - case S_MovableRelease: - case S_Use: - // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an - // imprecise release, clear our reverse insertion points. - if (OldSeq != S_Use || S.IsTrackingImpreciseReleases()) - S.ClearReverseInsertPts(); - // FALL THROUGH - case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. 
- if (Class != IC_RetainRV) + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: { + Arg = GetArgRCIdentityRoot(Inst); + BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg); + if (S.MatchWithRetain()) { + // Don't do retain+release tracking for ARCInstKind::RetainRV, because + // it's better to let it remain as the first instruction after a call. + if (Class != ARCInstKind::RetainRV) { + DEBUG(llvm::dbgs() << " Matching with: " << *Inst << "\n"); Retains[Inst] = S.GetRRInfo(); + } S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); } - ANNOTATE_BOTTOMUP(Inst, Arg, OldSeq, S.GetSeq()); // A retain moving bottom up can be a use. break; } - case IC_AutoreleasepoolPop: + case ARCInstKind::AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. MyStates.clearBottomUpPointers(); return NestingDetected; - case IC_AutoreleasepoolPush: - case IC_None: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::None: // These are irrelevant. return NestingDetected; - case IC_User: + case ARCInstKind::User: // If we have a store into an alloca of a pointer we are tracking, the // pointer has multiple owners implying that we must be more conservative. // @@ -1806,9 +1152,10 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // in the presence of allocas we only unconditionally remove pointers if // both our retain and our release are KnownSafe. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) { - BBState::ptr_iterator I = MyStates.findPtrBottomUpState( - StripPointerCastsAndObjCCalls(SI->getValueOperand())); + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand(), DL)) { + auto I = MyStates.findPtrBottomUpState( + GetRCIdentityRoot(SI->getValueOperand())); if (I != MyStates.bottom_up_ptr_end()) MultiOwnersSet.insert(I->first); } @@ -1820,90 +1167,26 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // Consider any other possible effects of this instruction on each // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), - ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { + for (auto MI = MyStates.bottom_up_ptr_begin(), + ME = MyStates.bottom_up_ptr_end(); + MI != ME; ++MI) { const Value *Ptr = MI->first; if (Ptr == Arg) continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); + BottomUpPtrState &S = MI->second; - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr - << "\n"); - S.ClearKnownPositiveRefCount(); - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S.GetSeq()); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } + if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class)) + continue; - // Check for possible direct uses. 
- switch (Seq) { - case S_Release: - case S_MovableRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr - << "\n"); - assert(!S.HasReverseInsertPts()); - // If this is an invoke instruction, we're scanning it as part of - // one of its successor blocks, since we can't insert code after it - // in its own block, and we don't want to split critical edges. - if (isa<InvokeInst>(Inst)) - S.InsertReverseInsertPt(BB->getFirstInsertionPt()); - else - S.InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); - S.SetSeq(S_Use); - ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use); - } else if (Seq == S_Release && IsUser(Class)) { - DEBUG(dbgs() << "PreciseReleaseUse: Seq: " << Seq << "; " << *Ptr - << "\n"); - // Non-movable releases depend on any possible objc pointer use. - S.SetSeq(S_Stop); - ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop); - assert(!S.HasReverseInsertPts()); - // As above; handle invoke specially. - if (isa<InvokeInst>(Inst)) - S.InsertReverseInsertPt(BB->getFirstInsertionPt()); - else - S.InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); - } - break; - case S_Stop: - if (CanUse(Inst, Ptr, PA, Class)) { - DEBUG(dbgs() << "PreciseStopUse: Seq: " << Seq << "; " << *Ptr - << "\n"); - S.SetSeq(S_Use); - ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use); - } - break; - case S_CanRelease: - case S_Use: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } + S.HandlePotentialUse(BB, Inst, Ptr, PA, Class); } return NestingDetected; } -bool -ObjCARCOpt::VisitBottomUp(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains) { +bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + BlotMapVector<Value *, RRInfo> &Retains) { DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n"); @@ -1928,9 +1211,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, } } - // If ARC Annotations are enabled, output the current state of pointers at the - // bottom of the basic block. - ANNOTATE_BOTTOMUP_BBEND(MyStates, BB); + DEBUG(llvm::dbgs() << "Before:\n" << BBStates[BB] << "\n" + << "Performing Dataflow:\n"); // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { @@ -1940,7 +1222,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, if (isa<InvokeInst>(Inst)) continue; - DEBUG(dbgs() << "Visiting " << *Inst << "\n"); + DEBUG(dbgs() << " Visiting " << *Inst << "\n"); NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); } @@ -1955,9 +1237,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates); } - // If ARC Annotations are enabled, output the current state of pointers at the - // top of the basic block. - ANNOTATE_BOTTOMUP_BBSTART(MyStates, BB); + DEBUG(llvm::dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n"); return NestingDetected; } @@ -1967,146 +1247,66 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, DenseMap<Value *, RRInfo> &Releases, BBState &MyStates) { bool NestingDetected = false; - InstructionClass Class = GetInstructionClass(Inst); + ARCInstKind Class = GetARCInstKind(Inst); const Value *Arg = nullptr; + DEBUG(llvm::dbgs() << " Class: " << Class << "\n"); + switch (Class) { - case IC_RetainBlock: + case ARCInstKind::RetainBlock: // In OptimizeIndividualCalls, we have strength reduced all optimizable // objc_retainBlocks to objc_retains. 
Thus at this point any
-    // objc_retainBlocks that we see are not optimizable.
+    // objc_retainBlocks that we see are not optimizable. We need to break since
+    // a retain can be a potential use.
     break;
-  case IC_Retain:
-  case IC_RetainRV: {
-    Arg = GetObjCArg(Inst);
-
-    PtrState &S = MyStates.getPtrTopDownState(Arg);
-
-    // Don't do retain+release tracking for IC_RetainRV, because it's
-    // better to let it remain as the first instruction after a call.
-    if (Class != IC_RetainRV) {
-      // If we see two retains in a row on the same pointer. If so, make
-      // a note, and we'll cicle back to revisit it after we've
-      // hopefully eliminated the second retain, which may allow us to
-      // eliminate the first retain too.
-      // Theoretically we could implement removal of nested retain+release
-      // pairs by making PtrState hold a stack of states, but this is
-      // simple and avoids adding overhead for the non-nested case.
-      if (S.GetSeq() == S_Retain)
-        NestingDetected = true;
-
-      ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain);
-      S.ResetSequenceProgress(S_Retain);
-      S.SetKnownSafe(S.HasKnownPositiveRefCount());
-      S.InsertCall(Inst);
-    }
-
-    S.SetKnownPositiveRefCount();
-
+  case ARCInstKind::Retain:
+  case ARCInstKind::RetainRV: {
+    Arg = GetArgRCIdentityRoot(Inst);
+    TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+    NestingDetected |= S.InitTopDown(Class, Inst);
     // A retain can be a potential use; proceed to the generic checking
     // code below.
     break;
   }
-  case IC_Release: {
-    Arg = GetObjCArg(Inst);
-
-    PtrState &S = MyStates.getPtrTopDownState(Arg);
-    S.ClearKnownPositiveRefCount();
-
-    Sequence OldSeq = S.GetSeq();
-
-    MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
-
-    switch (OldSeq) {
-    case S_Retain:
-    case S_CanRelease:
-      if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
-        S.ClearReverseInsertPts();
-      // FALL THROUGH
-    case S_Use:
-      S.SetReleaseMetadata(ReleaseMetadata);
-      S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall());
+  case ARCInstKind::Release: {
+    Arg = GetArgRCIdentityRoot(Inst);
+    TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+    // Try to form a tentative pair between this release instruction and the
+    // top down pointers that we are tracking.
+    if (S.MatchWithRelease(MDKindCache, Inst)) {
+      // If we succeed, copy S's RRInfo into the Release -> {Retain Set
+      // Map}. Then we clear S.
+      DEBUG(llvm::dbgs() << "        Matching with: " << *Inst << "\n");
       Releases[Inst] = S.GetRRInfo();
-      ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None);
       S.ClearSequenceProgress();
-      break;
-    case S_None:
-      break;
-    case S_Stop:
-    case S_Release:
-    case S_MovableRelease:
-      llvm_unreachable("top-down pointer in release state!");
     }
     break;
   }
-  case IC_AutoreleasepoolPop:
+  case ARCInstKind::AutoreleasepoolPop:
     // Conservatively, clear MyStates for all known pointers.
     MyStates.clearTopDownPointers();
-    return NestingDetected;
-  case IC_AutoreleasepoolPush:
-  case IC_None:
-    // These are irrelevant.
-    return NestingDetected;
+    return false;
+  case ARCInstKind::AutoreleasepoolPush:
+  case ARCInstKind::None:
+    // These cannot be uses of the pointers we are tracking.
+    return false;
   default:
     break;
   }
 
   // Consider any other possible effects of this instruction on each
   // pointer being tracked.
-  for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
-       ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+  for (auto MI = MyStates.top_down_ptr_begin(),
+            ME = MyStates.top_down_ptr_end();
+       MI != ME; ++MI) {
     const Value *Ptr = MI->first;
     if (Ptr == Arg)
       continue; // Handled above.
- PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr - << "\n"); - S.ClearKnownPositiveRefCount(); - switch (Seq) { - case S_Retain: - S.SetSeq(S_CanRelease); - ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease); - assert(!S.HasReverseInsertPts()); - S.InsertReverseInsertPt(Inst); - - // One call can't cause a transition from S_Retain to S_CanRelease - // and S_CanRelease to S_Use. If we've made the first transition, - // we're done. - continue; - case S_Use: - case S_CanRelease: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } + TopDownPtrState &S = MI->second; + if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class)) + continue; - // Check for possible direct uses. - switch (Seq) { - case S_CanRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr - << "\n"); - S.SetSeq(S_Use); - ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_Use); - } - break; - case S_Retain: - case S_Use: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } + S.HandlePotentialUse(Inst, Ptr, PA, Class); } return NestingDetected; @@ -2138,27 +1338,22 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, } } - // If ARC Annotations are enabled, output the current state of pointers at the - // top of the basic block. - ANNOTATE_TOPDOWN_BBSTART(MyStates, BB); + DEBUG(llvm::dbgs() << "Before:\n" << BBStates[BB] << "\n" + << "Performing Dataflow:\n"); // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { Instruction *Inst = I; - DEBUG(dbgs() << "Visiting " << *Inst << "\n"); + DEBUG(dbgs() << " Visiting " << *Inst << "\n"); NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); } - // If ARC Annotations are enabled, output the current state of pointers at the - // bottom of the basic block. - ANNOTATE_TOPDOWN_BBEND(MyStates, BB); - -#ifdef ARC_ANNOTATIONS - if (!(EnableARCAnnotations && DisableCheckForCFGHazards)) -#endif + DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n" + << BBStates[BB] << "\n\n"); CheckForCFGHazards(BB, BBStates, MyStates); + DEBUG(llvm::dbgs() << "Final State:\n" << BBStates[BB] << "\n"); return NestingDetected; } @@ -2244,11 +1439,10 @@ ComputePostOrders(Function &F, } // Visit the function both top-down and bottom-up. -bool -ObjCARCOpt::Visit(Function &F, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases) { +bool ObjCARCOpt::Visit(Function &F, + DenseMap<const BasicBlock *, BBState> &BBStates, + BlotMapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases) { // Use reverse-postorder traversals, because we magically know that loops // will be well behaved, i.e. they won't repeatedly call retain on a single @@ -2258,7 +1452,7 @@ ObjCARCOpt::Visit(Function &F, SmallVector<BasicBlock *, 16> PostOrder; SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, - NoObjCARCExceptionsMDKind, + MDKindCache.get(ARCMDKindID::NoObjCARCExceptions), BBStates); // Use reverse-postorder on the reverse CFG for bottom-up. 
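The traversal order set up by ComputePostOrders is what makes the two dataflow walks converge quickly: top-down consumes a reverse post-order of the CFG, so each block is visited after its non-back-edge predecessors, while bottom-up consumes a reverse post-order of the reversed CFG, visiting each block after its successors. A self-contained sketch of that ordering on a toy diamond-shaped graph (not LLVM's implementation; plain adjacency lists stand in for basic blocks):

    #include <cstdio>
    #include <utility>
    #include <vector>

    // Iterative DFS producing a post-order numbering of a toy CFG.
    static void postOrder(const std::vector<std::vector<int>> &Succs, int Entry,
                          std::vector<int> &Order) {
      std::vector<bool> Visited(Succs.size(), false);
      std::vector<std::pair<int, unsigned>> Stack;
      Stack.push_back({Entry, 0});
      Visited[Entry] = true;
      while (!Stack.empty()) {
        int Node = Stack.back().first;
        unsigned &Next = Stack.back().second;
        if (Next < Succs[Node].size()) {
          int S = Succs[Node][Next++];
          if (!Visited[S]) {
            Visited[S] = true;
            Stack.push_back({S, 0}); // Next is re-read on the next iteration
          }
        } else {
          Order.push_back(Node); // all successors finished: emit post-order
          Stack.pop_back();
        }
      }
    }

    int main() {
      // A diamond: 0 -> {1, 2}, 1 -> 3, 2 -> 3.
      std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
      std::vector<int> Order;
      postOrder(Succs, 0, Order);
      // A top-down (forward) walk visits in *reverse* post-order; running the
      // same computation on the reversed CFG yields the bottom-up order.
      for (auto It = Order.rbegin(); It != Order.rend(); ++It)
        std::printf("visit block %d\n", *It);
      return 0;
    }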
@@ -2279,10 +1473,9 @@ ObjCARCOpt::Visit(Function &F, } /// Move the calls in RetainsToMove and ReleasesToMove. -void ObjCARCOpt::MoveCalls(Value *Arg, - RRInfo &RetainsToMove, +void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, - MapVector<Value *, RRInfo> &Retains, + BlotMapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, SmallVectorImpl<Instruction *> &DeadInsts, Module *M) { @@ -2295,7 +1488,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg, for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) { Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain); + Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain); CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt); Call->setDoesNotThrow(); Call->setTailCall(); @@ -2306,11 +1499,11 @@ void ObjCARCOpt::MoveCalls(Value *Arg, for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) { Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release); + Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Release); CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) - Call->setMetadata(ImpreciseReleaseMDKind, M); + Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M); Call->setDoesNotThrow(); if (ReleasesToMove.IsTailCallRelease) Call->setTailCall(); @@ -2333,20 +1526,15 @@ void ObjCARCOpt::MoveCalls(Value *Arg, } -bool -ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> - &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M, - SmallVectorImpl<Instruction *> &NewRetains, - SmallVectorImpl<Instruction *> &NewReleases, - SmallVectorImpl<Instruction *> &DeadInsts, - RRInfo &RetainsToMove, - RRInfo &ReleasesToMove, - Value *Arg, - bool KnownSafe, - bool &AnyPairsCompletelyEliminated) { +bool ObjCARCOpt::PairUpRetainsAndReleases( + DenseMap<const BasicBlock *, BBState> &BBStates, + BlotMapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, Module *M, + SmallVectorImpl<Instruction *> &NewRetains, + SmallVectorImpl<Instruction *> &NewReleases, + SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove, + RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe, + bool &AnyPairsCompletelyEliminated) { // If a pair happens in a region where it is known that the reference count // is already incremented, we can similarly ignore possible decrements unless // we are dealing with a retainable object with multiple provenance sources. 
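The signature changes above (MapVector to BlotMapVector on the Retains side) lean on one operation: "blotting" an entry clears it in place instead of erasing it, so insertion order and outstanding iteration remain stable while pairs are being consumed; PerformCodePlacement later skips such entries with `if (!V) continue; // blotted`. A rough standalone sketch of the idea, simplified from the real BlotMapVector (string keys here stand in for Value pointers):

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // Simplified BlotMapVector: insertion-ordered key/value storage where
    // erasure just nulls out ("blots") the key instead of shifting elements.
    template <class K, class V> class BlotMapVector {
      std::map<K, std::size_t> Map;        // key -> index into Vector
      std::vector<std::pair<K, V>> Vector; // insertion-ordered payload

    public:
      V &operator[](const K &Key) {
        auto It = Map.find(Key);
        if (It == Map.end()) {
          Map.emplace(Key, Vector.size());
          Vector.emplace_back(Key, V());
          return Vector.back().second;
        }
        return Vector[It->second].second;
      }

      void blot(const K &Key) {
        auto It = Map.find(Key);
        if (It == Map.end())
          return;
        Vector[It->second].first = K(); // leave a null slot; order preserved
        Map.erase(It);
      }

      auto begin() { return Vector.begin(); }
      auto end() { return Vector.end(); }
    };

    int main() {
      BlotMapVector<std::string, int> Retains;
      Retains["a"] = 1;
      Retains["b"] = 2;
      Retains.blot("a");
      for (auto &Entry : Retains) {
        if (Entry.first.empty())
          continue; // blotted, as in PerformCodePlacement
        std::printf("%s -> %d\n", Entry.first.c_str(), Entry.second);
      }
    }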
@@ -2367,15 +1555,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> for (SmallVectorImpl<Instruction *>::const_iterator NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) { Instruction *NewRetain = *NI; - MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); + auto It = Retains.find(NewRetain); assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; KnownSafeTD &= NewRetainRRI.KnownSafe; MultipleOwners = - MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain)); + MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain)); for (Instruction *NewRetainRelease : NewRetainRRI.Calls) { - DenseMap<Value *, RRInfo>::const_iterator Jt = - Releases.find(NewRetainRelease); + auto Jt = Releases.find(NewRetainRelease); if (Jt == Releases.end()) return false; const RRInfo &NewRetainReleaseRRI = Jt->second; @@ -2444,15 +1631,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> for (SmallVectorImpl<Instruction *>::const_iterator NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) { Instruction *NewRelease = *NI; - DenseMap<Value *, RRInfo>::const_iterator It = - Releases.find(NewRelease); + auto It = Releases.find(NewRelease); assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; KnownSafeBU &= NewReleaseRRI.KnownSafe; CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted; for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) { - MapVector<Value *, RRInfo>::const_iterator Jt = - Retains.find(NewReleaseRetain); + auto Jt = Retains.find(NewReleaseRetain); if (Jt == Retains.end()) return false; const RRInfo &NewReleaseRetainRRI = Jt->second; @@ -2504,11 +1689,8 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented in 1 direction and we do not have - // MultipleOwners, we can safely remove the retain/releases. Otherwise we need - // to be known safe in both directions. - bool UnconditionallySafe = (KnownSafeTD && KnownSafeBU) || - ((KnownSafeTD || KnownSafeBU) && !MultipleOwners); + // We can only remove pointers if we are known safe in both directions. + bool UnconditionallySafe = KnownSafeTD && KnownSafeBU; if (UnconditionallySafe) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); @@ -2538,12 +1720,6 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (OldDelta != 0) return false; -#ifdef ARC_ANNOTATIONS - // Do not move calls if ARC annotations are requested. - if (EnableARCAnnotations) - return false; -#endif // ARC_ANNOTATIONS - Changed = true; assert(OldCount != 0 && "Unreachable code?"); NumRRs += OldCount - NewCount; @@ -2556,12 +1732,10 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> /// Identify pairings between the retains and releases, and delete and/or move /// them. 
-bool -ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> - &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M) { +bool ObjCARCOpt::PerformCodePlacement( + DenseMap<const BasicBlock *, BBState> &BBStates, + BlotMapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, Module *M) { DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n"); bool AnyPairsCompletelyEliminated = false; @@ -2572,8 +1746,9 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 8> DeadInsts; // Visit each retain. - for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ++I) { + for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), + E = Retains.end(); + I != E; ++I) { Value *V = I->first; if (!V) continue; // blotted @@ -2581,7 +1756,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> DEBUG(dbgs() << "Visiting: " << *Retain << "\n"); - Value *Arg = GetObjCArg(Retain); + Value *Arg = GetArgRCIdentityRoot(Retain); // If the object being released is in static or stack storage, we know it's // not being managed by ObjC reference counting, so we can delete pairs @@ -2593,18 +1768,17 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) if (const GlobalVariable *GV = dyn_cast<GlobalVariable>( - StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + GetRCIdentityRoot(LI->getPointerOperand()))) if (GV->isConstant()) KnownSafe = true; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. NewRetains.push_back(Retain); - bool PerformMoveCalls = - ConnectTDBUTraversals(BBStates, Retains, Releases, M, NewRetains, - NewReleases, DeadInsts, RetainsToMove, - ReleasesToMove, Arg, KnownSafe, - AnyPairsCompletelyEliminated); + bool PerformMoveCalls = PairUpRetainsAndReleases( + BBStates, Retains, Releases, M, NewRetains, NewReleases, DeadInsts, + RetainsToMove, ReleasesToMove, Arg, KnownSafe, + AnyPairsCompletelyEliminated); if (PerformMoveCalls) { // Ok, everything checks out and we're all set. Let's move/delete some @@ -2640,12 +1814,13 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { DEBUG(dbgs() << "Visiting: " << *Inst << "\n"); - InstructionClass Class = GetBasicInstructionClass(Inst); - if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) + ARCInstKind Class = GetBasicARCInstKind(Inst); + if (Class != ARCInstKind::LoadWeak && + Class != ARCInstKind::LoadWeakRetained) continue; // Delete objc_loadWeak calls with no users. - if (Class == IC_LoadWeak && Inst->use_empty()) { + if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) { Inst->eraseFromParent(); continue; } @@ -2660,10 +1835,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { J = Current.getInstructionIterator(); J != B; --J) { Instruction *EarlierInst = &*std::prev(J); - InstructionClass EarlierClass = GetInstructionClass(EarlierInst); + ARCInstKind EarlierClass = GetARCInstKind(EarlierInst); switch (EarlierClass) { - case IC_LoadWeak: - case IC_LoadWeakRetained: { + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: { // If this is loading from the same pointer, replace this load's value // with that one. 
      CallInst *Call = cast<CallInst>(Inst);
@@ -2674,8 +1849,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
         case AliasAnalysis::MustAlias:
           Changed = true;
           // If the load has a builtin retain, insert a plain retain for it.
-          if (Class == IC_LoadWeakRetained) {
-            Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+          if (Class == ARCInstKind::LoadWeakRetained) {
+            Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
             CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
             CI->setTailCall();
           }
@@ -2691,8 +1866,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
           }
           break;
         }
-        case IC_StoreWeak:
-        case IC_InitWeak: {
+        case ARCInstKind::StoreWeak:
+        case ARCInstKind::InitWeak: {
          // If this is storing to the same pointer and has the same size etc.,
          // replace this load's value with the stored value.
          CallInst *Call = cast<CallInst>(Inst);
@@ -2703,8 +1878,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
          case AliasAnalysis::MustAlias:
            Changed = true;
            // If the load has a builtin retain, insert a plain retain for it.
-           if (Class == IC_LoadWeakRetained) {
-             Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+           if (Class == ARCInstKind::LoadWeakRetained) {
+             Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
              CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
              CI->setTailCall();
            }
@@ -2720,14 +1895,14 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
          }
          break;
        }
-      case IC_MoveWeak:
-      case IC_CopyWeak:
+      case ARCInstKind::MoveWeak:
+      case ARCInstKind::CopyWeak:
         // TODO: Grab the copied value.
         goto clobbered;
-      case IC_AutoreleasepoolPush:
-      case IC_None:
-      case IC_IntrinsicUser:
-      case IC_User:
+      case ARCInstKind::AutoreleasepoolPush:
+      case ARCInstKind::None:
+      case ARCInstKind::IntrinsicUser:
+      case ARCInstKind::User:
         // Weak pointers are only modified through the weak entry points
         // (and arbitrary calls, which could call the weak entry points).
         break;
@@ -2743,8 +1918,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
   // the alloca and all its users can be zapped.
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
     Instruction *Inst = &*I++;
-    InstructionClass Class = GetBasicInstructionClass(Inst);
-    if (Class != IC_DestroyWeak)
+    ARCInstKind Class = GetBasicARCInstKind(Inst);
+    if (Class != ARCInstKind::DestroyWeak)
       continue;
 
     CallInst *Call = cast<CallInst>(Inst);
     Value *Arg = Call->getArgOperand(0);
     if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
       for (User *U : Alloca->users()) {
         const Instruction *UserInst = cast<Instruction>(U);
-        switch (GetBasicInstructionClass(UserInst)) {
-        case IC_InitWeak:
-        case IC_StoreWeak:
-        case IC_DestroyWeak:
+        switch (GetBasicARCInstKind(UserInst)) {
+        case ARCInstKind::InitWeak:
+        case ARCInstKind::StoreWeak:
+        case ARCInstKind::DestroyWeak:
           continue;
         default:
           goto done;
@@ -2764,13 +1939,13 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
       Changed = true;
       for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
         CallInst *UserInst = cast<CallInst>(*UI++);
-        switch (GetBasicInstructionClass(UserInst)) {
-        case IC_InitWeak:
-        case IC_StoreWeak:
+        switch (GetBasicARCInstKind(UserInst)) {
+        case ARCInstKind::InitWeak:
+        case ARCInstKind::StoreWeak:
           // These functions return their second argument.
           UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
           break;
-        case IC_DestroyWeak:
+        case ARCInstKind::DestroyWeak:
           // No return value.
break; default: @@ -2792,7 +1967,7 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { // map stays valid when we get around to rewriting code and calls get // replaced by arguments. DenseMap<Value *, RRInfo> Releases; - MapVector<Value *, RRInfo> Retains; + BlotMapVector<Value *, RRInfo> Retains; // This is used during the traversal of the function to track the // states for each identified object at each block. @@ -2825,16 +2000,15 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain, if (DepInsts.size() != 1) return false; - CallInst *Call = - dyn_cast_or_null<CallInst>(*DepInsts.begin()); + auto *Call = dyn_cast_or_null<CallInst>(*DepInsts.begin()); // Check that the pointer is the return value of the call. if (!Call || Arg != Call) return false; // Check that the call is a regular call. - InstructionClass Class = GetBasicInstructionClass(Call); - if (Class != IC_CallOrUser && Class != IC_Call) + ARCInstKind Class = GetBasicARCInstKind(Call); + if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call) return false; return true; @@ -2854,13 +2028,11 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, if (DepInsts.size() != 1) return nullptr; - CallInst *Retain = - dyn_cast_or_null<CallInst>(*DepInsts.begin()); + auto *Retain = dyn_cast_or_null<CallInst>(*DepInsts.begin()); // Check that we found a retain with the same argument. - if (!Retain || - !IsRetain(GetBasicInstructionClass(Retain)) || - GetObjCArg(Retain) != Arg) { + if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) || + GetArgRCIdentityRoot(Retain) != Arg) { return nullptr; } @@ -2881,14 +2053,13 @@ FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB, if (DepInsts.size() != 1) return nullptr; - CallInst *Autorelease = - dyn_cast_or_null<CallInst>(*DepInsts.begin()); + auto *Autorelease = dyn_cast_or_null<CallInst>(*DepInsts.begin()); if (!Autorelease) return nullptr; - InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); + ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease); if (!IsAutorelease(AutoreleaseClass)) return nullptr; - if (GetObjCArg(Autorelease) != Arg) + if (GetArgRCIdentityRoot(Autorelease) != Arg) return nullptr; return Autorelease; @@ -2919,7 +2090,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { if (!Ret) continue; - const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); + const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0)); // Look for an ``autorelease'' instruction that is a predecessor of Ret and // dependent on Arg such that there are no instructions dependent on Arg @@ -2974,13 +2145,13 @@ ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; - switch (GetBasicInstructionClass(Inst)) { + switch (GetBasicARCInstKind(Inst)) { default: break; - case IC_Retain: + case ARCInstKind::Retain: ++NumRetains; break; - case IC_Release: + case ARCInstKind::Release: ++NumReleases; break; } @@ -2997,28 +2168,13 @@ bool ObjCARCOpt::doInitialization(Module &M) { if (!Run) return false; - // Identify the imprecise release metadata kind. 
- ImpreciseReleaseMDKind = - M.getContext().getMDKindID("clang.imprecise_release"); - CopyOnEscapeMDKind = - M.getContext().getMDKindID("clang.arc.copy_on_escape"); - NoObjCARCExceptionsMDKind = - M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); -#ifdef ARC_ANNOTATIONS - ARCAnnotationBottomUpMDKind = - M.getContext().getMDKindID("llvm.arc.annotation.bottomup"); - ARCAnnotationTopDownMDKind = - M.getContext().getMDKindID("llvm.arc.annotation.topdown"); - ARCAnnotationProvenanceSourceMDKind = - M.getContext().getMDKindID("llvm.arc.annotation.provenancesource"); -#endif // ARC_ANNOTATIONS - // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release // calls finalizers which can have arbitrary side effects. + MDKindCache.init(&M); // Initialize our runtime entry point cache. - EP.Initialize(&M); + EP.init(&M); return false; } @@ -3052,27 +2208,27 @@ bool ObjCARCOpt::runOnFunction(Function &F) { OptimizeIndividualCalls(F); // Optimizations for weak pointers. - if (UsedInThisFunction & ((1 << IC_LoadWeak) | - (1 << IC_LoadWeakRetained) | - (1 << IC_StoreWeak) | - (1 << IC_InitWeak) | - (1 << IC_CopyWeak) | - (1 << IC_MoveWeak) | - (1 << IC_DestroyWeak))) + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) | + (1 << unsigned(ARCInstKind::LoadWeakRetained)) | + (1 << unsigned(ARCInstKind::StoreWeak)) | + (1 << unsigned(ARCInstKind::InitWeak)) | + (1 << unsigned(ARCInstKind::CopyWeak)) | + (1 << unsigned(ARCInstKind::MoveWeak)) | + (1 << unsigned(ARCInstKind::DestroyWeak)))) OptimizeWeakCalls(F); // Optimizations for retain+release pairs. - if (UsedInThisFunction & ((1 << IC_Retain) | - (1 << IC_RetainRV) | - (1 << IC_RetainBlock))) - if (UsedInThisFunction & (1 << IC_Release)) + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) | + (1 << unsigned(ARCInstKind::RetainRV)) | + (1 << unsigned(ARCInstKind::RetainBlock)))) + if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release))) // Run OptimizeSequences until it either stops making changes or // no retain+release pair nesting is detected. while (OptimizeSequences(F)) {} // Optimizations if objc_autorelease is used. - if (UsedInThisFunction & ((1 << IC_Autorelease) | - (1 << IC_AutoreleaseRV))) + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) | + (1 << unsigned(ARCInstKind::AutoreleaseRV)))) OptimizeReturns(F); // Gather statistics after optimization. diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp deleted file mode 100644 index 53c077e..0000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp +++ /dev/null @@ -1,254 +0,0 @@ -//===- ObjCARCUtil.cpp - ObjC ARC Optimization ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file defines several utility functions used by various ARC -/// optimizations which are IMHO too big to be in a header file. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. 
Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#include "ObjCARC.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace llvm::objcarc; - -raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, - const InstructionClass Class) { - switch (Class) { - case IC_Retain: - return OS << "IC_Retain"; - case IC_RetainRV: - return OS << "IC_RetainRV"; - case IC_RetainBlock: - return OS << "IC_RetainBlock"; - case IC_Release: - return OS << "IC_Release"; - case IC_Autorelease: - return OS << "IC_Autorelease"; - case IC_AutoreleaseRV: - return OS << "IC_AutoreleaseRV"; - case IC_AutoreleasepoolPush: - return OS << "IC_AutoreleasepoolPush"; - case IC_AutoreleasepoolPop: - return OS << "IC_AutoreleasepoolPop"; - case IC_NoopCast: - return OS << "IC_NoopCast"; - case IC_FusedRetainAutorelease: - return OS << "IC_FusedRetainAutorelease"; - case IC_FusedRetainAutoreleaseRV: - return OS << "IC_FusedRetainAutoreleaseRV"; - case IC_LoadWeakRetained: - return OS << "IC_LoadWeakRetained"; - case IC_StoreWeak: - return OS << "IC_StoreWeak"; - case IC_InitWeak: - return OS << "IC_InitWeak"; - case IC_LoadWeak: - return OS << "IC_LoadWeak"; - case IC_MoveWeak: - return OS << "IC_MoveWeak"; - case IC_CopyWeak: - return OS << "IC_CopyWeak"; - case IC_DestroyWeak: - return OS << "IC_DestroyWeak"; - case IC_StoreStrong: - return OS << "IC_StoreStrong"; - case IC_CallOrUser: - return OS << "IC_CallOrUser"; - case IC_Call: - return OS << "IC_Call"; - case IC_User: - return OS << "IC_User"; - case IC_IntrinsicUser: - return OS << "IC_IntrinsicUser"; - case IC_None: - return OS << "IC_None"; - } - llvm_unreachable("Unknown instruction class!"); -} - -InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { - Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - - // No (mandatory) arguments. - if (AI == AE) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) - .Case("clang.arc.use", IC_IntrinsicUser) - .Default(IC_CallOrUser); - - // One argument. - const Argument *A0 = AI++; - if (AI == AE) - // Argument is a pointer. - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { - Type *ETy = PTy->getElementType(); - // Argument is i8*. 
- if (ETy->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_retain", IC_Retain) - .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) - .Case("objc_retainBlock", IC_RetainBlock) - .Case("objc_release", IC_Release) - .Case("objc_autorelease", IC_Autorelease) - .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) - .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) - .Case("objc_retainedObject", IC_NoopCast) - .Case("objc_unretainedObject", IC_NoopCast) - .Case("objc_unretainedPointer", IC_NoopCast) - .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) - .Case("objc_sync_enter", IC_User) - .Case("objc_sync_exit", IC_User) - .Default(IC_CallOrUser); - - // Argument is i8** - if (PointerType *Pte = dyn_cast<PointerType>(ETy)) - if (Pte->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_loadWeakRetained", IC_LoadWeakRetained) - .Case("objc_loadWeak", IC_LoadWeak) - .Case("objc_destroyWeak", IC_DestroyWeak) - .Default(IC_CallOrUser); - } - - // Two arguments, first is i8**. - const Argument *A1 = AI++; - if (AI == AE) - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) - if (Pte->getElementType()->isIntegerTy(8)) - if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - Type *ETy1 = PTy1->getElementType(); - // Second argument is i8* - if (ETy1->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_storeWeak", IC_StoreWeak) - .Case("objc_initWeak", IC_InitWeak) - .Case("objc_storeStrong", IC_StoreStrong) - .Default(IC_CallOrUser); - // Second argument is i8**. - if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) - if (Pte1->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_moveWeak", IC_MoveWeak) - .Case("objc_copyWeak", IC_CopyWeak) - // Ignore annotation calls. This is important to stop the - // optimizer from treating annotations as uses which would - // make the state of the pointers they are attempting to - // elucidate to be incorrect. - .Case("llvm.arc.annotation.topdown.bbstart", IC_None) - .Case("llvm.arc.annotation.topdown.bbend", IC_None) - .Case("llvm.arc.annotation.bottomup.bbstart", IC_None) - .Case("llvm.arc.annotation.bottomup.bbend", IC_None) - .Default(IC_CallOrUser); - } - - // Anything else. - return IC_CallOrUser; -} - -/// \brief Determine what kind of construct V is. -InstructionClass -llvm::objcarc::GetInstructionClass(const Value *V) { - if (const Instruction *I = dyn_cast<Instruction>(V)) { - // Any instruction other than bitcast and gep with a pointer operand have a - // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer - // to a subsequent use, rather than using it themselves, in this sense. - // As a short cut, several other opcodes are known to have no pointer - // operands of interest. And ret is never followed by a release, so it's - // not interesting to examine. - switch (I->getOpcode()) { - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - // Check for calls to special functions. - if (const Function *F = CI->getCalledFunction()) { - InstructionClass Class = GetFunctionClass(F); - if (Class != IC_CallOrUser) - return Class; - - // None of the intrinsic functions do objc_release. 
For intrinsics, the - // only question is whether or not they may be users. - switch (F->getIntrinsicID()) { - case Intrinsic::returnaddress: case Intrinsic::frameaddress: - case Intrinsic::stacksave: case Intrinsic::stackrestore: - case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: - case Intrinsic::objectsize: case Intrinsic::prefetch: - case Intrinsic::stackprotector: - case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: - case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: - case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: - case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: - case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: - case Intrinsic::invariant_start: case Intrinsic::invariant_end: - // Don't let dbg info affect our results. - case Intrinsic::dbg_declare: case Intrinsic::dbg_value: - // Short cut: Some intrinsics obviously don't use ObjC pointers. - return IC_None; - default: - break; - } - } - return GetCallSiteClass(CI); - } - case Instruction::Invoke: - return GetCallSiteClass(cast<InvokeInst>(I)); - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::Select: case Instruction::PHI: - case Instruction::Ret: case Instruction::Br: - case Instruction::Switch: case Instruction::IndirectBr: - case Instruction::Alloca: case Instruction::VAArg: - case Instruction::Add: case Instruction::FAdd: - case Instruction::Sub: case Instruction::FSub: - case Instruction::Mul: case Instruction::FMul: - case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: - case Instruction::SRem: case Instruction::URem: case Instruction::FRem: - case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: - case Instruction::IntToPtr: case Instruction::FCmp: - case Instruction::FPTrunc: case Instruction::FPExt: - case Instruction::FPToUI: case Instruction::FPToSI: - case Instruction::UIToFP: case Instruction::SIToFP: - case Instruction::InsertElement: case Instruction::ExtractElement: - case Instruction::ShuffleVector: - case Instruction::ExtractValue: - break; - case Instruction::ICmp: - // Comparing a pointer with null, or any other constant, isn't an - // interesting use, because we don't care what the pointer points to, or - // about the values of any other dynamic reference-counted pointers. - if (IsPotentialRetainableObjPtr(I->getOperand(1))) - return IC_User; - break; - default: - // For anything else, check all the operands. - // Note that this includes both operands of a Store: while the first - // operand isn't actually being dereferenced, it is being stored to - // memory where we can no longer track who might read it and dereference - // it, so we have to consider it potentially used. - for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); - OI != OE; ++OI) - if (IsPotentialRetainableObjPtr(*OI)) - return IC_User; - } - } - - // Otherwise, it's totally inert for ARC purposes. 
- return IC_None; -} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp index 410abfc..8346345 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp @@ -32,20 +32,22 @@ using namespace llvm::objcarc; bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) { + const DataLayout &DL = A->getModule()->getDataLayout(); // If the values are Selects with the same condition, we can do a more precise // check: just check for relations between the values on corresponding arms. if (const SelectInst *SB = dyn_cast<SelectInst>(B)) if (A->getCondition() == SB->getCondition()) - return related(A->getTrueValue(), SB->getTrueValue()) || - related(A->getFalseValue(), SB->getFalseValue()); + return related(A->getTrueValue(), SB->getTrueValue(), DL) || + related(A->getFalseValue(), SB->getFalseValue(), DL); // Check both arms of the Select node individually. - return related(A->getTrueValue(), B) || - related(A->getFalseValue(), B); + return related(A->getTrueValue(), B, DL) || + related(A->getFalseValue(), B, DL); } bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) { + const DataLayout &DL = A->getModule()->getDataLayout(); // If the values are PHIs in the same block, we can do a more precise as well // as efficient check: just check for relations between the values on // corresponding edges. @@ -53,16 +55,15 @@ bool ProvenanceAnalysis::relatedPHI(const PHINode *A, if (PNB->getParent() == A->getParent()) { for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) if (related(A->getIncomingValue(i), - PNB->getIncomingValueForBlock(A->getIncomingBlock(i)))) + PNB->getIncomingValueForBlock(A->getIncomingBlock(i)), DL)) return true; return false; } // Check each unique source of the PHI node against B. SmallPtrSet<const Value *, 4> UniqueSrc; - for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) { - const Value *PV1 = A->getIncomingValue(i); - if (UniqueSrc.insert(PV1).second && related(PV1, B)) + for (Value *PV1 : A->incoming_values()) { + if (UniqueSrc.insert(PV1).second && related(PV1, B, DL)) return true; } @@ -103,11 +104,11 @@ static bool IsStoredObjCPointer(const Value *P) { return false; } -bool ProvenanceAnalysis::relatedCheck(const Value *A, - const Value *B) { +bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B, + const DataLayout &DL) { // Skip past provenance pass-throughs. - A = GetUnderlyingObjCPtr(A); - B = GetUnderlyingObjCPtr(B); + A = GetUnderlyingObjCPtr(A, DL); + B = GetUnderlyingObjCPtr(B, DL); // Quick check. if (A == B) @@ -159,8 +160,8 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, return true; } -bool ProvenanceAnalysis::related(const Value *A, - const Value *B) { +bool ProvenanceAnalysis::related(const Value *A, const Value *B, + const DataLayout &DL) { // Begin by inserting a conservative value into the map. If the insertion // fails, we have the answer already. If it succeeds, leave it there until we // compute the real answer to guard against recursive queries. 
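The comment above describes a memoization pattern worth noting: the cache is seeded with a conservative answer before the real computation runs, so a recursive query for the same value pair terminates immediately with the safe default instead of looping. Below is a minimal standalone sketch of that pattern; the RelationCache type and relatedCheck placeholder are hypothetical stand-ins, not the pass's API.

    #include <map>
    #include <utility>

    // Hypothetical miniature of the caching scheme; only the conservative
    // seeding trick is taken from the pass, everything else is illustrative.
    struct RelationCache {
      std::map<std::pair<const void *, const void *>, bool> Cached;

      bool related(const void *A, const void *B) {
        // Seed with the conservative answer ("related") before computing, so
        // a recursive query for the same pair sees the safe default and stops.
        auto Pair = Cached.insert({{A, B}, true});
        if (!Pair.second)
          return Pair.first->second;
        bool Result = relatedCheck(A, B); // may recurse back into related()
        Cached[{A, B}] = Result;
        return Result;
      }

      // Placeholder for the real relatedness test.
      bool relatedCheck(const void *, const void *) { return false; }
    };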
@@ -170,7 +171,7 @@ bool ProvenanceAnalysis::related(const Value *A, if (!Pair.second) return Pair.first->second; - bool Result = relatedCheck(A, B); + bool Result = relatedCheck(A, B, DL); CachedResults[ValuePairTy(A, B)] = Result; return Result; } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 7820468..0ac41d3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -30,6 +30,7 @@ namespace llvm { class Value; class AliasAnalysis; + class DataLayout; class PHINode; class SelectInst; } @@ -53,12 +54,12 @@ class ProvenanceAnalysis { typedef DenseMap<ValuePairTy, bool> CachedResultsTy; CachedResultsTy CachedResults; - bool relatedCheck(const Value *A, const Value *B); + bool relatedCheck(const Value *A, const Value *B, const DataLayout &DL); bool relatedSelect(const SelectInst *A, const Value *B); bool relatedPHI(const PHINode *A, const Value *B); - void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; - ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + void operator=(const ProvenanceAnalysis &) = delete; + ProvenanceAnalysis(const ProvenanceAnalysis &) = delete; public: ProvenanceAnalysis() {} @@ -67,7 +68,7 @@ public: AliasAnalysis *getAA() const { return AA; } - bool related(const Value *A, const Value *B); + bool related(const Value *A, const Value *B, const DataLayout &DL); void clear() { CachedResults.clear(); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index d836632..0be75af 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -65,6 +66,7 @@ bool PAEval::runOnFunction(Function &F) { ProvenanceAnalysis PA; PA.setAA(&getAnalysis<AliasAnalysis>()); + const DataLayout &DL = F.getParent()->getDataLayout(); for (Value *V1 : Values) { StringRef NameV1 = getName(V1); @@ -73,7 +75,7 @@ bool PAEval::runOnFunction(Function &F) { if (NameV1 >= NameV2) continue; errs() << NameV1 << " and " << NameV2; - if (PA.related(V1, V2)) + if (PA.related(V1, V2, DL)) errs() << " are related.\n"; else errs() << " are not related.\n"; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp new file mode 100644 index 0000000..ae20e7e --- /dev/null +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -0,0 +1,404 @@ +//===--- PtrState.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "PtrState.h" +#include "DependencyAnalysis.h" +#include "ObjCARC.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::objcarc; + +#define DEBUG_TYPE "objc-arc-ptr-state" + +//===----------------------------------------------------------------------===// +// Utility +//===----------------------------------------------------------------------===// + +raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) { + switch (S) { + case S_None: + return OS << "S_None"; + case S_Retain: + return OS << "S_Retain"; + case S_CanRelease: + return OS << "S_CanRelease"; + case S_Use: + return OS << "S_Use"; + case S_Release: + return OS << "S_Release"; + case S_MovableRelease: + return OS << "S_MovableRelease"; + case S_Stop: + return OS << "S_Stop"; + } + llvm_unreachable("Unknown sequence type."); +} + +//===----------------------------------------------------------------------===// +// Sequence +//===----------------------------------------------------------------------===// + +static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { + // The easy cases. + if (A == B) + return A; + if (A == S_None || B == S_None) + return S_None; + + if (A > B) + std::swap(A, B); + if (TopDown) { + // Choose the side which is further along in the sequence. + if ((A == S_Retain || A == S_CanRelease) && + (B == S_CanRelease || B == S_Use)) + return B; + } else { + // Choose the side which is further along in the sequence. + if ((A == S_Use || A == S_CanRelease) && + (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease)) + return A; + // If both sides are releases, choose the more conservative one. + if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) + return A; + if (A == S_Release && B == S_MovableRelease) + return A; + } + + return S_None; +} + +//===----------------------------------------------------------------------===// +// RRInfo +//===----------------------------------------------------------------------===// + +void RRInfo::clear() { + KnownSafe = false; + IsTailCallRelease = false; + ReleaseMetadata = nullptr; + Calls.clear(); + ReverseInsertPts.clear(); + CFGHazardAfflicted = false; +} + +bool RRInfo::Merge(const RRInfo &Other) { + // Conservatively merge the ReleaseMetadata information. + if (ReleaseMetadata != Other.ReleaseMetadata) + ReleaseMetadata = nullptr; + + // Conservatively merge the boolean state. + KnownSafe &= Other.KnownSafe; + IsTailCallRelease &= Other.IsTailCallRelease; + CFGHazardAfflicted |= Other.CFGHazardAfflicted; + + // Merge the call sets. + Calls.insert(Other.Calls.begin(), Other.Calls.end()); + + // Merge the insert point sets. If there are any differences, + // that makes this a partial merge. 
+ bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size(); + for (Instruction *Inst : Other.ReverseInsertPts) + Partial |= ReverseInsertPts.insert(Inst).second; + return Partial; +} + +//===----------------------------------------------------------------------===// +// PtrState +//===----------------------------------------------------------------------===// + +void PtrState::SetKnownPositiveRefCount() { + DEBUG(dbgs() << " Setting Known Positive.\n"); + KnownPositiveRefCount = true; +} + +void PtrState::ClearKnownPositiveRefCount() { + DEBUG(dbgs() << " Clearing Known Positive.\n"); + KnownPositiveRefCount = false; +} + +void PtrState::SetSeq(Sequence NewSeq) { + DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq << "\n"); + Seq = NewSeq; +} + +void PtrState::ResetSequenceProgress(Sequence NewSeq) { + DEBUG(dbgs() << " Resetting sequence progress.\n"); + SetSeq(NewSeq); + Partial = false; + RRI.clear(); +} + +void PtrState::Merge(const PtrState &Other, bool TopDown) { + Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown); + KnownPositiveRefCount &= Other.KnownPositiveRefCount; + + // If we're not in a sequence (anymore), drop all associated state. + if (Seq == S_None) { + Partial = false; + RRI.clear(); + } else if (Partial || Other.Partial) { + // If we're doing a merge on a path that's previously seen a partial + // merge, conservatively drop the sequence, to avoid doing partial + // RR elimination. If the branch predicates for the two merges differ, + // mixing them is unsafe. + ClearSequenceProgress(); + } else { + // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this + // point, we know that currently we are not partial. Stash whether or not + // the merge operation caused us to undergo a partial merging of reverse + // insertion points. + Partial = RRI.Merge(Other.RRI); + } +} + +//===----------------------------------------------------------------------===// +// BottomUpPtrState +//===----------------------------------------------------------------------===// + +bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) { + // If we see two releases in a row on the same pointer, make a note; + // we'll circle back to revisit it after we've + // hopefully eliminated the second release, which may allow us to + // eliminate the first release too. + // Theoretically we could implement removal of nested retain+release + // pairs by making PtrState hold a stack of states, but this is + // simple and avoids adding overhead for the non-nested case. + bool NestingDetected = false; + if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) { + DEBUG(dbgs() << " Found nested releases (i.e. a release pair)\n"); + NestingDetected = true; + } + + MDNode *ReleaseMetadata = + I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease)); + Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release; + ResetSequenceProgress(NewSeq); + SetReleaseMetadata(ReleaseMetadata); + SetKnownSafe(HasKnownPositiveRefCount()); + SetTailCallRelease(cast<CallInst>(I)->isTailCall()); + InsertCall(I); + SetKnownPositiveRefCount(); + return NestingDetected; +} + +bool BottomUpPtrState::MatchWithRetain() { + SetKnownPositiveRefCount(); + + Sequence OldSeq = GetSeq(); + switch (OldSeq) { + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an + // imprecise release, clear our reverse insertion points. 
+ if (OldSeq != S_Use || IsTrackingImpreciseReleases()) + ClearReverseInsertPts(); + // FALL THROUGH + case S_CanRelease: + return true; + case S_None: + return false; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + llvm_unreachable("Sequence unknown enum value"); +} + +bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { + Sequence S = GetSeq(); + + // Check for possible releases. + if (!CanAlterRefCount(Inst, Ptr, PA, Class)) + return false; + + DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; " << *Ptr + << "\n"); + switch (S) { + case S_Use: + SetSeq(S_CanRelease); + return true; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + return false; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + llvm_unreachable("Sequence unknown enum value"); +} + +void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { + // Check for possible direct uses. + switch (GetSeq()) { + case S_Release: + case S_MovableRelease: + if (CanUse(Inst, Ptr, PA, Class)) { + DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr + << "\n"); + assert(!HasReverseInsertPts()); + // If this is an invoke instruction, we're scanning it as part of + // one of its successor blocks, since we can't insert code after it + // in its own block, and we don't want to split critical edges. + if (isa<InvokeInst>(Inst)) + InsertReverseInsertPt(BB->getFirstInsertionPt()); + else + InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + SetSeq(S_Use); + } else if (Seq == S_Release && IsUser(Class)) { + DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; " + << *Ptr << "\n"); + // Non-movable releases depend on any possible objc pointer use. + SetSeq(S_Stop); + assert(!HasReverseInsertPts()); + // As above; handle invoke specially. + if (isa<InvokeInst>(Inst)) + InsertReverseInsertPt(BB->getFirstInsertionPt()); + else + InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + } + break; + case S_Stop: + if (CanUse(Inst, Ptr, PA, Class)) { + DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq() << "; " + << *Ptr << "\n"); + SetSeq(S_Use); + } + break; + case S_CanRelease: + case S_Use: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } +} + +//===----------------------------------------------------------------------===// +// TopDownPtrState +//===----------------------------------------------------------------------===// + +bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) { + bool NestingDetected = false; + // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Kind != ARCInstKind::RetainRV) { + // If we see two retains in a row on the same pointer, make a note; + // we'll circle back to revisit it after we've + // hopefully eliminated the second retain, which may allow us to + // eliminate the first retain too. + // Theoretically we could implement removal of nested retain+release + // pairs by making PtrState hold a stack of states, but this is + // simple and avoids adding overhead for the non-nested case. 
+ if (GetSeq() == S_Retain) + NestingDetected = true; + + ResetSequenceProgress(S_Retain); + SetKnownSafe(HasKnownPositiveRefCount()); + InsertCall(I); + } + + SetKnownPositiveRefCount(); + return NestingDetected; +} + +bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache, + Instruction *Release) { + ClearKnownPositiveRefCount(); + + Sequence OldSeq = GetSeq(); + + MDNode *ReleaseMetadata = + Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease)); + + switch (OldSeq) { + case S_Retain: + case S_CanRelease: + if (OldSeq == S_Retain || ReleaseMetadata != nullptr) + ClearReverseInsertPts(); + // FALL THROUGH + case S_Use: + SetReleaseMetadata(ReleaseMetadata); + SetTailCallRelease(cast<CallInst>(Release)->isTailCall()); + return true; + case S_None: + return false; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in bottom up state!"); + } + llvm_unreachable("Sequence unknown enum value"); +} + +bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst, + const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { + // Check for possible releases. + if (!CanAlterRefCount(Inst, Ptr, PA, Class)) + return false; + + DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr + << "\n"); + ClearKnownPositiveRefCount(); + switch (GetSeq()) { + case S_Retain: + SetSeq(S_CanRelease); + assert(!HasReverseInsertPts()); + InsertReverseInsertPt(Inst); + + // One call can't cause a transition from S_Retain to S_CanRelease + // and S_CanRelease to S_Use. If we've made the first transition, + // we're done. + return true; + case S_Use: + case S_CanRelease: + case S_None: + return false; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + llvm_unreachable("covered switch is not covered!?"); +} + +void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, + ARCInstKind Class) { + // Check for possible direct uses. + switch (GetSeq()) { + case S_CanRelease: + if (!CanUse(Inst, Ptr, PA, Class)) + return; + DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr + << "\n"); + SetSeq(S_Use); + return; + case S_Retain: + case S_Use: + case S_None: + return; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } +} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h new file mode 100644 index 0000000..e45e1ea --- /dev/null +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h @@ -0,0 +1,210 @@ +//===--- PtrState.h - ARC State for a Ptr -------------------*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains declarations for the ARC state associated with a ptr. It +// is only used by the ARC Sequence Dataflow computation. By separating this +// from the actual dataflow, it is easier to consider the mechanics of the ARC +// optimization separate from the actual predicates being used. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H +#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H + +#include "ARCInstKind.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" + +namespace llvm { +namespace objcarc { + +class ARCMDKindCache; +class ProvenanceAnalysis; + +/// \enum Sequence +/// +/// \brief A sequence of states that a pointer may go through in which an +/// objc_retain and objc_release are actually needed. +enum Sequence { + S_None, + S_Retain, ///< objc_retain(x). + S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement. + S_Use, ///< any use of x. + S_Stop, ///< like S_Release, but code motion is stopped. + S_Release, ///< objc_release(x). + S_MovableRelease ///< objc_release(x), !clang.imprecise_release. +}; + +raw_ostream &operator<<(raw_ostream &OS, + const Sequence S) LLVM_ATTRIBUTE_UNUSED; + +/// \brief Unidirectional information about either a +/// retain-decrement-use-release sequence or release-use-decrement-retain +/// reverse sequence. +struct RRInfo { + /// After an objc_retain, the reference count of the referenced + /// object is known to be positive. Similarly, before an objc_release, the + /// reference count of the referenced object is known to be positive. If + /// there are retain-release pairs in code regions where the retain count + /// is known to be positive, they can be eliminated, regardless of any side + /// effects between them. + /// + /// Also, a retain+release pair nested within another retain+release + /// pair, all on the same known pointer value, can be eliminated, regardless + /// of any intervening side effects. + /// + /// KnownSafe is true when either of these conditions is satisfied. + bool KnownSafe; + + /// True if the objc_release calls are all marked with the "tail" keyword. + bool IsTailCallRelease; + + /// If the Calls are objc_release calls and they all have a + /// clang.imprecise_release tag, this is the metadata tag. + MDNode *ReleaseMetadata; + + /// For a top-down sequence, the set of objc_retains or + /// objc_retainBlocks. For bottom-up, the set of objc_releases. + SmallPtrSet<Instruction *, 2> Calls; + + /// The set of optimal insert positions for moving calls in the opposite + /// sequence. + SmallPtrSet<Instruction *, 2> ReverseInsertPts; + + /// If this is true, we cannot perform code motion but can still remove + /// retain/release pairs. + bool CFGHazardAfflicted; + + RRInfo() + : KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr), + CFGHazardAfflicted(false) {} + + void clear(); + + /// Conservatively merge the two RRInfo. Returns true if a partial merge has + /// occurred, false otherwise. + bool Merge(const RRInfo &Other); +}; + +/// \brief This class summarizes several per-pointer runtime properties which +/// are propagated through the flow graph. +class PtrState { +protected: + /// True if the reference count is known to be incremented. + bool KnownPositiveRefCount; + + /// True if we've seen an opportunity for partial RR elimination, such as + /// pushing calls into a CFG triangle or into one side of a CFG diamond. + bool Partial; + + /// The current position in the sequence. + unsigned char Seq : 8; + + /// Unidirectional information about the current sequence. 
+ RRInfo RRI; + + PtrState() : KnownPositiveRefCount(false), Partial(false), Seq(S_None) {} + +public: + bool IsKnownSafe() const { return RRI.KnownSafe; } + + void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; } + + bool IsTailCallRelease() const { return RRI.IsTailCallRelease; } + + void SetTailCallRelease(const bool NewValue) { + RRI.IsTailCallRelease = NewValue; + } + + bool IsTrackingImpreciseReleases() const { + return RRI.ReleaseMetadata != nullptr; + } + + const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; } + + void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; } + + bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; } + + void SetCFGHazardAfflicted(const bool NewValue) { + RRI.CFGHazardAfflicted = NewValue; + } + + void SetKnownPositiveRefCount(); + void ClearKnownPositiveRefCount(); + + bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; } + + void SetSeq(Sequence NewSeq); + + Sequence GetSeq() const { return static_cast<Sequence>(Seq); } + + void ClearSequenceProgress() { ResetSequenceProgress(S_None); } + + void ResetSequenceProgress(Sequence NewSeq); + void Merge(const PtrState &Other, bool TopDown); + + void InsertCall(Instruction *I) { RRI.Calls.insert(I); } + + void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); } + + void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); } + + bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); } + + const RRInfo &GetRRInfo() const { return RRI; } +}; + +struct BottomUpPtrState : PtrState { + BottomUpPtrState() : PtrState() {} + + /// (Re-)Initialize this bottom up pointer returning true if we detected a + /// pointer with nested releases. + bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I); + + /// Return true if this set of releases can be paired with a retain. Modifies + /// state appropriately to reflect that the matching occurred if it is + /// successful. + /// + /// It is assumed that one has already checked that the RCIdentity of the + /// retain and the RCIdentity of this ptr state are the same. + bool MatchWithRetain(); + + void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); + bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); +}; + +struct TopDownPtrState : PtrState { + TopDownPtrState() : PtrState() {} + + /// (Re-)Initialize this top down pointer returning true if we detected a + /// pointer with nested retains. + bool InitTopDown(ARCInstKind Kind, Instruction *I); + + /// Return true if this set of retains can be paired with the given + /// release. Modifies state appropriately to reflect that the matching + /// occurred. 
+ bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release); + + void HandlePotentialUse(Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); + + bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, ARCInstKind Class); +}; + +} // end namespace objcarc +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index 3d91984..d6fc916 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -32,19 +32,18 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); namespace { - struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; +struct ADCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCE() : FunctionPass(ID) { + initializeADCEPass(*PassRegistry::getPassRegistry()); + } - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } + bool runOnFunction(Function& F) override; - }; + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + } +}; } char ADCE::ID = 0; @@ -54,46 +53,45 @@ bool ADCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; - SmallPtrSet<Instruction*, 128> alive; - SmallVector<Instruction*, 128> worklist; + SmallPtrSet<Instruction*, 128> Alive; + SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (isa<TerminatorInst>(I.getInstructionIterator()) || - isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || - isa<LandingPadInst>(I.getInstructionIterator()) || - I->mayHaveSideEffects()) { - alive.insert(I.getInstructionIterator()); - worklist.push_back(I.getInstructionIterator()); + for (Instruction &I : inst_range(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + Alive.insert(&I); + Worklist.push_back(&I); } + } // Propagate liveness backwards to operands. - while (!worklist.empty()) { - Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); - OI != OE; ++OI) - if (Instruction* Inst = dyn_cast<Instruction>(OI)) - if (alive.insert(Inst).second) - worklist.push_back(Inst); + while (!Worklist.empty()) { + Instruction *Curr = Worklist.pop_back_val(); + for (Use &OI : Curr->operands()) { + if (Instruction *Inst = dyn_cast<Instruction>(OI)) + if (Alive.insert(Inst).second) + Worklist.push_back(Inst); + } } // The inverse of the live set is the dead set. These are those instructions // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the worklist vector here for memory efficiency. - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (!alive.count(I.getInstructionIterator())) { - worklist.push_back(I.getInstructionIterator()); - I->dropAllReferences(); + // NOTE: We reuse the Worklist vector here for memory efficiency. 
+ for (Instruction &I : inst_range(F)) { + if (!Alive.count(&I)) { + Worklist.push_back(&I); + I.dropAllReferences(); } + } - for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(), - E = worklist.end(); I != E; ++I) { + for (Instruction *&I : Worklist) { ++NumRemoved; - (*I)->eraseFromParent(); + I->eraseFromParent(); } - return !worklist.empty(); + return !Worklist.empty(); } FunctionPass *llvm::createAggressiveDCEPass() { diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index f48cefa..8918909 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -23,15 +23,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -50,15 +50,15 @@ struct AlignmentFromAssumptions : public FunctionPass { initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); } @@ -71,7 +71,6 @@ struct AlignmentFromAssumptions : public FunctionPass { ScalarEvolution *SE; DominatorTree *DT; - const DataLayout *DL; bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV, const SCEV *&OffSCEV); @@ -123,7 +122,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, // If the displacement is not an exact multiple, but the remainder is a // constant, then return this remainder (but only if it is a power of 2). 
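The remainder arithmetic just described is easy to verify with plain integers. Below is a scalar sketch that ignores the SCEV machinery; the newAlignment name and signature are illustrative, not the pass's API, and it assumes a nonzero power-of-two base alignment.

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    // If a pointer is known Align-aligned and we access it at byte offset
    // Diff, the access keeps the full alignment when Diff is an exact
    // multiple; otherwise the remainder is the derived alignment, but only
    // when that remainder is itself a power of two.
    unsigned newAlignment(int64_t Diff, uint64_t Align) {
      int64_t DiffUnits = Diff % (int64_t)Align;
      if (DiffUnits == 0)
        return (unsigned)Align;
      uint64_t DiffUnitsAbs = (uint64_t)std::abs(DiffUnits);
      if ((DiffUnitsAbs & (DiffUnitsAbs - 1)) == 0) // power of two?
        return (unsigned)DiffUnitsAbs;
      return 0; // no useful alignment derived
    }

    int main() {
      assert(newAlignment(64, 32) == 32); // exact multiple: full alignment
      assert(newAlignment(4, 32) == 4);   // remainder 4 is a power of two
      // Remainder 20 is not a power of two, so the check above gives up,
      // even though gcd reasoning could still have derived alignment 4.
      assert(newAlignment(20, 32) == 0);
      return 0;
    }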
- uint64_t DiffUnitsAbs = abs64(DiffUnits); + uint64_t DiffUnitsAbs = std::abs(DiffUnits); if (isPowerOf2_64(DiffUnitsAbs)) return (unsigned) DiffUnitsAbs; } @@ -316,7 +315,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { continue; if (Instruction *K = dyn_cast<Instruction>(J)) - if (isValidAssumeForContext(ACall, K, DL, DT)) + if (isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } @@ -400,7 +399,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { Visited.insert(J); for (User *UJ : J->users()) { Instruction *K = cast<Instruction>(UJ); - if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DL, DT)) + if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } } @@ -413,8 +412,6 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) { auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; NewDestAlignments.clear(); NewSrcAlignments.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp new file mode 100644 index 0000000..09c605e --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -0,0 +1,410 @@ +//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Bit-Tracking Dead Code Elimination pass. Some +// instructions (shifts, some ands, ors, etc.) kill some of their input bits. +// We track these dead bits and remove instructions that compute only these +// dead bits. 
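A concrete illustration of the dead-bit idea may help before the implementation; this standalone snippet is illustrative only and not part of the pass. The demo function is a hypothetical name.

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // The OR below writes only bits 8..31, but the AND that follows keeps
    // only bits 0..7, so every bit the OR produces is dead. A bit-tracking
    // DCE can delete the OR even though a whole-value DCE sees it as "used".
    uint32_t demo(uint32_t X) {
      uint32_t Noise = X | 0xFFFFFF00u; // all bits written here die below
      return Noise & 0x000000FFu;       // only the low byte is alive
    }

    int main() {
      for (uint32_t X : {0u, 1u, 0xDEADBEEFu})
        assert(demo(X) == (X & 0xFFu)); // the OR never affects the result
      return 0;
    }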
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "bdce" + +STATISTIC(NumRemoved, "Number of instructions removed (unused)"); +STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)"); + +namespace { +struct BDCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BDCE() : FunctionPass(ID) { + initializeBDCEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override; + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + } + + void determineLiveOperandBits(const Instruction *UserI, + const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2); + + AssumptionCache *AC; + DominatorTree *DT; +}; +} + +char BDCE::ID = 0; +INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", + false, false) + +static bool isAlwaysLive(Instruction *I) { + return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + isa<LandingPadInst>(I) || I->mayHaveSideEffects(); +} + +void BDCE::determineLiveOperandBits(const Instruction *UserI, + const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2) { + unsigned BitWidth = AB.getBitWidth(); + + // We're called once per operand, but for some instructions, we need to + // compute known bits of both operands in order to determine the live bits of + // either (when both operands are instructions themselves). We don't, + // however, want to do this twice, so we cache the result in APInts that live + // in the caller. For the two-relevant-operands case, both operand values are + // provided here. 
+ auto ComputeKnownBits = + [&](unsigned BitWidth, const Value *V1, const Value *V2) { + const DataLayout &DL = I->getModule()->getDataLayout(); + KnownZero = APInt(BitWidth, 0); + KnownOne = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, + AC, UserI, DT); + + if (V2) { + KnownZero2 = APInt(BitWidth, 0); + KnownOne2 = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, + 0, AC, UserI, DT); + } + }; + + switch (UserI->getOpcode()) { + default: break; + case Instruction::Call: + case Instruction::Invoke: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: + // The alive bits of the input are the swapped alive bits of + // the output. + AB = AOut.byteSwap(); + break; + case Intrinsic::ctlz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the left of, and including, the leftmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getHighBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countLeadingZeros()+1)); + } + break; + case Intrinsic::cttz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the right of, and including, the rightmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getLowBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countTrailingZeros()+1)); + } + break; + } + break; + case Instruction::Add: + case Instruction::Sub: + // Find the highest live output bit. We don't need any more input + // bits than that (adds, and thus subtracts, ripple only to the + // left). + AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); + break; + case Instruction::Shl: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.lshr(ShiftAmt); + + // If the shift is nuw/nsw, then the high bits are not dead + // (because we've promised that they *must* be zero). + const ShlOperator *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::LShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::AShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) + .getBoolValue()) + AB.setBit(BitWidth-1); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). 
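The shift cases here all propagate demanded bits by shifting the output's alive mask the opposite way. A standalone check of that relation for the Shl rule (illustrative code, not the pass; the masks and values are made up):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t AOut = 0xF0u;    // bits of (X << 3) the consumer reads
      const uint32_t AIn = AOut >> 3; // demanded bits of X, per the Shl rule
      uint32_t X1 = 0x1Eu;            // agrees with X2 on every bit in AIn
      uint32_t X2 = X1 | ~AIn;        // differs from X1 only in dead bits
      // Values that agree on the demanded input bits must agree on the live
      // output bits; nothing outside AIn can influence anything inside AOut.
      assert(((X1 << 3) & AOut) == ((X2 << 3) & AOut));
      return 0;
    }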
+ if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::And: + AB = AOut; + + // For bits that are known zero, the corresponding bits in the + // other operand are dead (unless they're both zero, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownZero2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownZero & ~KnownZero2); + } + break; + case Instruction::Or: + AB = AOut; + + // For bits that are known one, the corresponding bits in the + // other operand are dead (unless they're both one, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownOne2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownOne & ~KnownOne2); + } + break; + case Instruction::Xor: + case Instruction::PHI: + AB = AOut; + break; + case Instruction::Trunc: + AB = AOut.zext(BitWidth); + break; + case Instruction::ZExt: + AB = AOut.trunc(BitWidth); + break; + case Instruction::SExt: + AB = AOut.trunc(BitWidth); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), + AOut.getBitWidth() - BitWidth)) + .getBoolValue()) + AB.setBit(BitWidth-1); + break; + case Instruction::Select: + if (OperandNo != 0) + AB = AOut; + break; + } +} + +bool BDCE::runOnFunction(Function& F) { + if (skipOptnoneFunction(F)) + return false; + + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + DenseMap<Instruction *, APInt> AliveBits; + SmallVector<Instruction*, 128> Worklist; + + // The set of visited instructions (non-integer-typed only). + SmallPtrSet<Instruction*, 128> Visited; + + // Collect the set of "root" instructions that are known live. + for (Instruction &I : inst_range(F)) { + if (!isAlwaysLive(&I)) + continue; + + DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); + // For integer-valued instructions, set up an initial empty set of alive + // bits and add the instruction to the work list. For other instructions + // add their operands to the work list (for integer values operands, mark + // all bits as live). + if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { + if (!AliveBits.count(&I)) { + AliveBits[&I] = APInt(IT->getBitWidth(), 0); + Worklist.push_back(&I); + } + + continue; + } + + // Non-integer-typed instructions... + for (Use &OI : I.operands()) { + if (Instruction *J = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) + AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); + Worklist.push_back(J); + } + } + // To save memory, we don't add I to the Visited set here. Instead, we + // check isAlwaysLive on every instruction when searching for dead + // instructions later (we need to check isAlwaysLive for the + // integer-typed instructions anyway). + } + + // Propagate liveness backwards to operands. 
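This backward propagation is the same worklist fixpoint ADCE uses, extended with per-bit masks. Stripped of the masks, its shape is roughly the sketch below; the Node type is a made-up stand-in for instructions, not the pass's data structures.

    #include <set>
    #include <vector>

    // Illustrative only: seed the worklist with known-live roots, then mark
    // the operands of everything reachable until no new nodes become live.
    struct Node { std::vector<Node *> Operands; bool Root = false; };

    std::set<Node *> computeLive(const std::vector<Node *> &All) {
      std::set<Node *> Alive;
      std::vector<Node *> Worklist;
      for (Node *N : All)
        if (N->Root) { Alive.insert(N); Worklist.push_back(N); }
      while (!Worklist.empty()) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        for (Node *Op : N->Operands)
          if (Alive.insert(Op).second) // newly discovered live node
            Worklist.push_back(Op);
      }
      return Alive; // everything else is dead and can be erased
    }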
+ while (!Worklist.empty()) { + Instruction *UserI = Worklist.pop_back_val(); + + DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); + APInt AOut; + if (UserI->getType()->isIntegerTy()) { + AOut = AliveBits[UserI]; + DEBUG(dbgs() << " Alive Out: " << AOut); + } + DEBUG(dbgs() << "\n"); + + if (!UserI->getType()->isIntegerTy()) + Visited.insert(UserI); + + APInt KnownZero, KnownOne, KnownZero2, KnownOne2; + // Compute the set of alive bits for each operand. These are anded into the + // existing set, if any, and if that changes the set of alive bits, the + // operand is added to the work-list. + for (Use &OI : UserI->operands()) { + if (Instruction *I = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { + unsigned BitWidth = IT->getBitWidth(); + APInt AB = APInt::getAllOnesValue(BitWidth); + if (UserI->getType()->isIntegerTy() && !AOut && + !isAlwaysLive(UserI)) { + // If all bits of the output are dead, then all bits of the input + // are dead as well. + AB = APInt(BitWidth, 0); + } else { + // Bits of each operand that are used to compute alive bits of the + // output are alive, all others are dead. + determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, + KnownZero, KnownOne, + KnownZero2, KnownOne2); + } + + // If we've added to the set of alive bits (or the operand has not + // been previously visited), then re-queue the operand to be visited + // again. + APInt ABPrev(BitWidth, 0); + auto ABI = AliveBits.find(I); + if (ABI != AliveBits.end()) + ABPrev = ABI->second; + + APInt ABNew = AB | ABPrev; + if (ABNew != ABPrev || ABI == AliveBits.end()) { + AliveBits[I] = std::move(ABNew); + Worklist.push_back(I); + } + } else if (!Visited.count(I)) { + Worklist.push_back(I); + } + } + } + } + + bool Changed = false; + // The inverse of the live set is the dead set. These are those instructions + // which have no side effects and do not influence the control flow or return + // value of the function, and may therefore be deleted safely. + // NOTE: We reuse the Worklist vector here for memory efficiency. + for (Instruction &I : inst_range(F)) { + // For live instructions that have all dead bits, first make them dead by + // replacing all uses with something else. Then, if they don't need to + // remain live (because they have side effects, etc.) we can remove them. + if (I.getType()->isIntegerTy()) { + auto ABI = AliveBits.find(&I); + if (ABI != AliveBits.end()) { + if (ABI->second.getBoolValue()) + continue; + + DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + // FIXME: In theory we could substitute undef here instead of zero. + // This should be reconsidered once we settle on the semantics of + // undef, poison, etc. 
+ Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; + } + } else if (Visited.count(&I)) { + continue; + } + + if (isAlwaysLive(&I)) + continue; + + Worklist.push_back(&I); + I.dropAllReferences(); + Changed = true; + } + + for (Instruction *&I : Worklist) { + ++NumRemoved; + I->eraseFromParent(); + } + + return Changed; +} + +FunctionPass *llvm::createBitTrackingDCEPass() { + return new BDCE(); +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 27c177a..4288742 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -43,6 +43,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include <tuple> using namespace llvm; @@ -131,14 +132,14 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: /// \brief Initialize the pass. void setup(Function &Fn) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TTI = &getAnalysis<TargetTransformInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); Entry = &Fn.getEntryBlock(); } @@ -176,7 +177,7 @@ char ConstantHoisting::ID = 0; INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting", false, false) @@ -186,6 +187,9 @@ FunctionPass *llvm::createConstantHoistingPass() { /// \brief Perform the constant hoisting optimization for the given function. 
bool ConstantHoisting::runOnFunction(Function &Fn) { + if (skipOptnoneFunction(Fn)) + return false; + DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index dd51ce1..c974ebb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -22,11 +22,10 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include <set> using namespace llvm; @@ -45,7 +44,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -53,7 +52,7 @@ namespace { char ConstantPropagation::ID = 0; INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop", "Simple constant propagation", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ConstantPropagation, "constprop", "Simple constant propagation", false, false) @@ -68,9 +67,9 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const DataLayout &DL = F.getParent()->getDataLayout(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); while (!WorkList.empty()) { Instruction *I = *WorkList.begin(); diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 5a3b5cf..d1302c6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -102,32 +103,52 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P); - // Look if the incoming value is a select with a constant but LVI tells us - // that the incoming value can never be that constant. In that case replace - // the incoming value with the other value of the select. This often allows - // us to remove the select later. + // Look if the incoming value is a select with a scalar condition for which + // LVI can tell us the value. In that case replace the incoming value with + // the appropriate value of the select. This often allows us to remove the + // select later. 
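A hedged sketch of the decision being added here, using a three-valued stand-in for what LazyValueInfo can prove about the select's condition on the incoming edge; the EdgeValue, MiniSelect, and foldIncomingSelect names are all illustrative, not the pass's API.

    // Illustrative only: mirrors the shape of the new select handling.
    enum class EdgeValue { True, False, Unknown };

    struct MiniSelect { int TrueV, FalseV; };

    // Returns the folded incoming value, or -1 as a sentinel when the
    // condition cannot be proven on this edge and the select must stay.
    int foldIncomingSelect(const MiniSelect &S, EdgeValue CondOnEdge) {
      switch (CondOnEdge) {
      case EdgeValue::True:  return S.TrueV;  // condition proven true
      case EdgeValue::False: return S.FalseV; // condition proven false
      case EdgeValue::Unknown: return -1;     // keep the select
      }
      return -1;
    }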
if (!V) { SelectInst *SI = dyn_cast<SelectInst>(Incoming); if (!SI) continue; - Constant *C = dyn_cast<Constant>(SI->getFalseValue()); - if (!C) continue; + Value *Condition = SI->getCondition(); + if (!Condition->getType()->isVectorTy()) { + if (Constant *C = LVI->getConstantOnEdge(Condition, P->getIncomingBlock(i), BB, P)) { + if (C == ConstantInt::getTrue(Condition->getType())) { + V = SI->getTrueValue(); + } else { + V = SI->getFalseValue(); + } + // Once LVI learns to handle vector types, we could also add support + // for vector type constants that are not all zeroes or all ones. + } + } - if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, - P->getIncomingBlock(i), BB, P) != - LazyValueInfo::False) - continue; + // Look if the select has a constant but LVI tells us that the incoming + // value can never be that constant. In that case replace the incoming + // value with the other value of the select. This often allows us to + // remove the select later. + if (!V) { + Constant *C = dyn_cast<Constant>(SI->getFalseValue()); + if (!C) continue; + + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, + P->getIncomingBlock(i), BB, P) != + LazyValueInfo::False) + continue; + V = SI->getTrueValue(); + } DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n'); - V = SI->getTrueValue(); } P->setIncomingValue(i, V); Changed = true; } - // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction. - if (Value *V = SimplifyInstruction(P)) { + // FIXME: Provide TLI, DT, AT to SimplifyInstruction. + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (Value *V = SimplifyInstruction(P, DL)) { P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 99fac75..3b262a2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -21,7 +21,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -42,7 +42,8 @@ namespace { bool runOnBasicBlock(BasicBlock &BB) override { if (skipOptnoneFunction(BB)) return false; - TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = DI++; @@ -95,7 +96,8 @@ bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; // Start out with all of the instructions in the worklist... 
std::vector<Instruction*> WorkList; diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a1ddc00..01952cf 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -33,7 +34,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -78,7 +79,8 @@ namespace { bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallSetVector<Value*, 16> &DeadStackObjects); + SmallSetVector<Value *, 16> &DeadStackObjects, + const DataLayout &DL); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -166,7 +168,7 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { return true; } } - if (CallSite CS = I) { + if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { if (TLI && TLI->has(LibFunc::strcpy) && F->getName() == TLI->getName(LibFunc::strcpy)) { @@ -194,18 +196,12 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { /// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { - const DataLayout *DL = AA.getDataLayout(); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { // memcpy/memmove/memset. AliasAnalysis::Location Loc = AA.getLocationForDest(MI); - // If we don't have target data around, an unknown size in Location means - // that we should use the size of the pointee type. This isn't valid for - // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && DL == nullptr) - return AliasAnalysis::Location(); return Loc; } @@ -215,11 +211,6 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. case Intrinsic::init_trampoline: - // If we don't have target data around, an unknown size in Location means - // that we should use the size of the pointee type. This isn't valid for - // init.trampoline, which writes more than an i8. - if (!DL) return AliasAnalysis::Location(); - // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. return AliasAnalysis::Location(II->getArgOperand(0)); @@ -271,7 +262,7 @@ static bool isRemovable(Instruction *I) { } } - if (CallSite CS = I) + if (auto CS = CallSite(I)) return CS.getInstruction()->use_empty(); return false; @@ -315,15 +306,16 @@ static Value *getStoredPointerOperand(Instruction *I) { } } - CallSite CS = I; + CallSite CS(I); // All the supported functions so far happen to have dest as their first // argument. 
return CS.getArgument(0); } -static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { +static uint64_t getPointerSize(const Value *V, const DataLayout &DL, + const TargetLibraryInfo *TLI) { uint64_t Size; - if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo())) + if (getObjectSize(V, Size, DL, TLI)) return Size; return AliasAnalysis::UnknownSize; } @@ -343,10 +335,9 @@ namespace { /// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, const AliasAnalysis::Location &Earlier, - AliasAnalysis &AA, - int64_t &EarlierOff, - int64_t &LaterOff) { - const DataLayout *DL = AA.getDataLayout(); + const DataLayout &DL, + const TargetLibraryInfo *TLI, + int64_t &EarlierOff, int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -367,7 +358,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || - Earlier.Size == AliasAnalysis::UnknownSize || DL == nullptr) + Earlier.Size == AliasAnalysis::UnknownSize) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, @@ -382,7 +373,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, return OverwriteUnknown; // If the "Later" store is to a recognizable object, get its size. - uint64_t ObjectSize = getPointerSize(UO2, AA); + uint64_t ObjectSize = getPointerSize(UO2, DL, TLI); if (ObjectSize != AliasAnalysis::UnknownSize) if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) return OverwriteComplete; @@ -560,8 +551,10 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, - DepWriteOffset, InstWriteOffset); + const DataLayout &DL = BB.getModule()->getDataLayout(); + OverwriteResult OR = + isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), + DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); @@ -655,6 +648,7 @@ bool DSE::HandleFree(CallInst *F) { AliasAnalysis::Location Loc = AliasAnalysis::Location(F->getOperand(0)); SmallVector<BasicBlock *, 16> Blocks; Blocks.push_back(F->getParent()); + const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); @@ -668,7 +662,7 @@ bool DSE::HandleFree(CallInst *F) { break; Value *DepPointer = - GetUnderlyingObject(getStoredPointerOperand(Dependency)); + GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); // Check for aliasing. 
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) @@ -728,6 +722,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (AI->hasByValOrInAllocaAttr()) DeadStackObjects.insert(AI); + const DataLayout &DL = BB.getModule()->getDataLayout(); + // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; @@ -736,7 +732,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; - GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); + GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); // Stores to stack values are valid candidates for removal. bool AllDead = true; @@ -784,7 +780,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } - if (CallSite CS = cast<Value>(BBI)) { + if (auto CS = CallSite(BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. if (isAllocLikeFn(BBI, TLI)) @@ -799,8 +795,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + AliasAnalysis::ModRefResult A = AA->getModRefInfo( + CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; }); @@ -835,7 +831,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Remove any allocas from the DeadPointer set that are loaded, as this // makes any stores above the access live. - RemoveAccessedObjects(LoadedLoc, DeadStackObjects); + RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL); // If all of the allocas were clobbered by the access then we're not going // to find anything else to process. @@ -850,8 +846,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallSetVector<Value*, 16> &DeadStackObjects) { - const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); + SmallSetVector<Value *, 16> &DeadStackObjects, + const DataLayout &DL) { + const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL); // A constant can't be in the dead pointer set. if (isa<Constant>(UnderlyingPointer)) @@ -867,7 +864,8 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. 
- AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + AliasAnalysis::Location StackLoc( + I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 969b9a8..d536a93 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -12,12 +12,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -26,7 +28,8 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <deque> using namespace llvm; @@ -40,49 +43,44 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); STATISTIC(NumCSECall, "Number of call instructions CSE'd"); STATISTIC(NumDSE, "Number of trivial dead stores removed"); -static unsigned getHash(const void *V) { - return DenseMapInfo<const void*>::getHashValue(V); -} - //===----------------------------------------------------------------------===// // SimpleValue //===----------------------------------------------------------------------===// namespace { - /// SimpleValue - Instances of this struct represent available values in the - /// scoped hash table. - struct SimpleValue { - Instruction *Inst; +/// \brief Struct representing the available values in the scoped hash table. +struct SimpleValue { + Instruction *Inst; - SimpleValue(Instruction *I) : Inst(I) { - assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); - } + SimpleValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } - bool isSentinel() const { - return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || - Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); - } + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction *>::getEmptyKey() || + Inst == DenseMapInfo<Instruction *>::getTombstoneKey(); + } - static bool canHandle(Instruction *Inst) { - // This can only handle non-void readnone functions. - if (CallInst *CI = dyn_cast<CallInst>(Inst)) - return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || - isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || - isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); - } - }; + static bool canHandle(Instruction *Inst) { + // This can only handle non-void readnone functions. 
+ if (CallInst *CI = dyn_cast<CallInst>(Inst)) + return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); + return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || + isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + } +}; } namespace llvm { -template<> struct DenseMapInfo<SimpleValue> { +template <> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { - return DenseMapInfo<Instruction*>::getEmptyKey(); + return DenseMapInfo<Instruction *>::getEmptyKey(); } static inline SimpleValue getTombstoneKey() { - return DenseMapInfo<Instruction*>::getTombstoneKey(); + return DenseMapInfo<Instruction *>::getTombstoneKey(); } static unsigned getHashValue(SimpleValue Val); static bool isEqual(SimpleValue LHS, SimpleValue RHS); @@ -92,7 +90,7 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; // Hash in all of the operands as pointers. - if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) { + if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) { Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) @@ -101,8 +99,9 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { if (isa<OverflowingBinaryOperator>(BinOp)) { // Hash the overflow behavior unsigned Overflow = - BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | - BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap; + BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | + BinOp->hasNoUnsignedWrap() * + OverflowingBinaryOperator::NoUnsignedWrap; return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); } @@ -135,12 +134,13 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || - isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction"); + isa<ShuffleVectorInst>(Inst)) && + "Invalid/unknown instruction"); // Mix in the opcode. 
- return hash_combine(Inst->getOpcode(), - hash_combine_range(Inst->value_op_begin(), - Inst->value_op_end())); + return hash_combine( + Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), Inst->value_op_end())); } bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { @@ -149,22 +149,24 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHS.isSentinel() || RHS.isSentinel()) return LHSI == RHSI; - if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - if (LHSI->isIdenticalTo(RHSI)) return true; + if (LHSI->getOpcode() != RHSI->getOpcode()) + return false; + if (LHSI->isIdenticalTo(RHSI)) + return true; // If we're not strictly identical, we still might be a commutable instruction if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) { if (!LHSBinOp->isCommutative()) return false; - assert(isa<BinaryOperator>(RHSI) - && "same opcode, but different instruction type?"); + assert(isa<BinaryOperator>(RHSI) && + "same opcode, but different instruction type?"); BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); // Check overflow attributes if (isa<OverflowingBinaryOperator>(LHSBinOp)) { - assert(isa<OverflowingBinaryOperator>(RHSBinOp) - && "same opcode, but different operator type?"); + assert(isa<OverflowingBinaryOperator>(RHSBinOp) && + "same opcode, but different operator type?"); if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) return false; @@ -172,16 +174,16 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { // Commuted equality return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && - LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); + LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); } if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) { - assert(isa<CmpInst>(RHSI) - && "same opcode, but different instruction type?"); + assert(isa<CmpInst>(RHSI) && + "same opcode, but different instruction type?"); CmpInst *RHSCmp = cast<CmpInst>(RHSI); // Commuted equality return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) && - LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && - LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); + LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && + LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); } return false; @@ -192,57 +194,52 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { //===----------------------------------------------------------------------===// namespace { - /// CallValue - Instances of this struct represent available call values in - /// the scoped hash table. - struct CallValue { - Instruction *Inst; +/// \brief Struct representing the available call values in the scoped hash +/// table. +struct CallValue { + Instruction *Inst; - CallValue(Instruction *I) : Inst(I) { - assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); - } + CallValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } - bool isSentinel() const { - return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || - Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); - } + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction *>::getEmptyKey() || + Inst == DenseMapInfo<Instruction *>::getTombstoneKey(); + } - static bool canHandle(Instruction *Inst) { - // Don't value number anything that returns void. 
- if (Inst->getType()->isVoidTy()) - return false; + static bool canHandle(Instruction *Inst) { + // Don't value number anything that returns void. + if (Inst->getType()->isVoidTy()) + return false; - CallInst *CI = dyn_cast<CallInst>(Inst); - if (!CI || !CI->onlyReadsMemory()) - return false; - return true; - } - }; + CallInst *CI = dyn_cast<CallInst>(Inst); + if (!CI || !CI->onlyReadsMemory()) + return false; + return true; + } +}; } namespace llvm { - template<> struct DenseMapInfo<CallValue> { - static inline CallValue getEmptyKey() { - return DenseMapInfo<Instruction*>::getEmptyKey(); - } - static inline CallValue getTombstoneKey() { - return DenseMapInfo<Instruction*>::getTombstoneKey(); - } - static unsigned getHashValue(CallValue Val); - static bool isEqual(CallValue LHS, CallValue RHS); - }; +template <> struct DenseMapInfo<CallValue> { + static inline CallValue getEmptyKey() { + return DenseMapInfo<Instruction *>::getEmptyKey(); + } + static inline CallValue getTombstoneKey() { + return DenseMapInfo<Instruction *>::getTombstoneKey(); + } + static unsigned getHashValue(CallValue Val); + static bool isEqual(CallValue LHS, CallValue RHS); +}; } + unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; - // Hash in all of the operands as pointers. - unsigned Res = 0; - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { - assert(!Inst->getOperand(i)->getType()->isMetadataTy() && - "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); - } - - // Mix in the opcode. - return (Res << 1) ^ Inst->getOpcode(); + // Hash all of the operands as pointers and mix in the opcode. + return hash_combine( + Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), Inst->value_op_end())); } bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { @@ -252,103 +249,104 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { return LHSI->isIdenticalTo(RHSI); } - //===----------------------------------------------------------------------===// -// EarlyCSE pass. +// EarlyCSE implementation //===----------------------------------------------------------------------===// namespace { - -/// EarlyCSE - This pass does a simple depth-first walk over the dominator -/// tree, eliminating trivially redundant instructions and using instsimplify -/// to canonicalize things as it goes. It is intended to be fast and catch -/// obvious cases so that instcombine and other passes are more effective. It -/// is expected that a later pass of GVN will catch the interesting/hard -/// cases. -class EarlyCSE : public FunctionPass { +/// \brief A simple and fast domtree-based CSE pass. +/// +/// This pass does a simple depth-first walk over the dominator tree, +/// eliminating trivially redundant instructions and using instsimplify to +/// canonicalize things as it goes. It is intended to be fast and catch obvious +/// cases so that instcombine and other passes are more effective. It is +/// expected that a later pass of GVN will catch the interesting/hard cases. 
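One detail from the CallValue hunk above, before the EarlyCSE class definition that follows: the hand-rolled hash (XOR of shifted operand hashes) is replaced by llvm::hash_combine over the opcode and the operand range, matching SimpleValue. XOR-style mixing is only weakly position-dependent and collides easily; order-sensitive combining does not. A standalone illustration (the mixing constant below is the usual golden-ratio mixer, chosen purely for demonstration; it is not LLVM's actual implementation):

#include <cstdint>
#include <functional>
#include <iostream>

// Order-insensitive XOR mixing: swapped operands produce the same hash.
uint64_t xorMix(uint64_t a, uint64_t b) { return a ^ b; }

// Order-sensitive mixing in the spirit of hash_combine.
uint64_t combine(uint64_t seed, uint64_t v) {
  return seed ^ (v + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

int main() {
  uint64_t a = std::hash<int>{}(1), b = std::hash<int>{}(2);
  std::cout << (xorMix(a, b) == xorMix(b, a)) << "\n"; // 1: guaranteed collision
  std::cout << (combine(combine(0, a), b) == combine(combine(0, b), a))
            << "\n"; // 0: argument order now matters, as it should for calls
}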
+class EarlyCSE { public: - const DataLayout *DL; - const TargetLibraryInfo *TLI; - DominatorTree *DT; - AssumptionCache *AC; - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; - typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, + Function &F; + const TargetLibraryInfo &TLI; + const TargetTransformInfo &TTI; + DominatorTree &DT; + AssumptionCache &AC; + typedef RecyclingAllocator< + BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; + typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; - /// AvailableValues - This scoped hash table contains the current values of - /// all of our simple scalar expressions. As we walk down the domtree, we - /// look to see if instructions are in this: if so, we replace them with what - /// we find, otherwise we insert them so that dominated values can succeed in - /// their lookup. - ScopedHTType *AvailableValues; - - /// AvailableLoads - This scoped hash table contains the current values - /// of loads. This allows us to get efficient access to dominating loads when - /// we have a fully redundant load. In addition to the most recent load, we - /// keep track of a generation count of the read, which is compared against - /// the current generation count. The current generation count is - /// incremented after every possibly writing memory operation, which ensures - /// that we only CSE loads with other loads that have no intervening store. - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator; - typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, - DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; - LoadHTType *AvailableLoads; - - /// AvailableCalls - This scoped hash table contains the current values - /// of read-only call values. It uses the same generation count as loads. - typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; - CallHTType *AvailableCalls; - - /// CurrentGeneration - This is the current generation of the memory value. + /// \brief A scoped hash table of the current values of all of our simple + /// scalar expressions. + /// + /// As we walk down the domtree, we look to see if instructions are in this: + /// if so, we replace them with what we find, otherwise we insert them so + /// that dominated values can succeed in their lookup. + ScopedHTType AvailableValues; + + /// \brief A scoped hash table of the current values of loads. + /// + /// This allows us to get efficient access to dominating loads when we have + /// a fully redundant load. In addition to the most recent load, we keep + /// track of a generation count of the read, which is compared against the + /// current generation count. The current generation count is incremented + /// after every possibly writing memory operation, which ensures that we only + /// CSE loads with other loads that have no intervening store. + typedef RecyclingAllocator< + BumpPtrAllocator, + ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>> + LoadMapAllocator; + typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, + DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; + LoadHTType AvailableLoads; + + /// \brief A scoped hash table of the current values of read-only call + /// values. + /// + /// It uses the same generation count as loads. 
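The scoped tables documented in the comments above all rely on one mechanism: entries added while visiting a dominator-tree node must disappear when the depth-first walk leaves that node's subtree, so that only dominating definitions are ever found. A minimal standalone model of such a scoped table (plain C++; unlike LLVM's ScopedHashTable it does not support shadowing an outer entry with an inner one):

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

class ScopedTable {
  std::unordered_map<std::string, int> map;
  std::vector<std::vector<std::string>> scopes;

public:
  // RAII scope: keys inserted inside it are removed when it dies.
  struct Scope {
    ScopedTable &t;
    explicit Scope(ScopedTable &t) : t(t) { t.scopes.emplace_back(); }
    ~Scope() {
      for (auto &k : t.scopes.back())
        t.map.erase(k);
      t.scopes.pop_back();
    }
  };

  void insert(const std::string &k, int v) {
    map[k] = v;
    scopes.back().push_back(k);
  }
  const int *lookup(const std::string &k) const {
    auto it = map.find(k);
    return it == map.end() ? nullptr : &it->second;
  }
};

int main() {
  ScopedTable t;
  ScopedTable::Scope outer(t);
  t.insert("x+y", 1); // available in the entry block
  {
    ScopedTable::Scope inner(t); // entering a dominated block
    t.insert("a*b", 2);
    assert(t.lookup("x+y")); // dominating value is still visible
  } // leaving the block pops its values
  assert(!t.lookup("a*b"));
}

The generation counter described above layers on top of this: loads and calls are only reused when no possibly-writing instruction has bumped the generation since they were recorded.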
+ typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType; + CallHTType AvailableCalls; + + /// \brief This is the current generation of the memory value. unsigned CurrentGeneration; - static char ID; - explicit EarlyCSE() : FunctionPass(ID) { - initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); - } + /// \brief Set up the EarlyCSE runner for a particular function. + EarlyCSE(Function &F, const TargetLibraryInfo &TLI, + const TargetTransformInfo &TTI, DominatorTree &DT, + AssumptionCache &AC) + : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} - bool runOnFunction(Function &F) override; + bool run(); private: - - // NodeScope - almost a POD, but needs to call the constructors for the - // scoped hash tables so that a new scope gets pushed on. These are RAII so - // that the scope gets popped when the NodeScope is destroyed. + // Almost a POD, but needs to call the constructors for the scoped hash + // tables so that a new scope gets pushed on. These are RAII so that the + // scope gets popped when the NodeScope is destroyed. class NodeScope { - public: - NodeScope(ScopedHTType *availableValues, - LoadHTType *availableLoads, - CallHTType *availableCalls) : - Scope(*availableValues), - LoadScope(*availableLoads), - CallScope(*availableCalls) {} - - private: - NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION; - void operator=(const NodeScope&) LLVM_DELETED_FUNCTION; + public: + NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, + CallHTType &AvailableCalls) + : Scope(AvailableValues), LoadScope(AvailableLoads), + CallScope(AvailableCalls) {} + + private: + NodeScope(const NodeScope &) = delete; + void operator=(const NodeScope &) = delete; ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; CallHTType::ScopeTy CallScope; }; - // StackNode - contains all the needed information to create a stack for - // doing a depth first tranversal of the tree. This includes scopes for - // values, loads, and calls as well as the generation. There is a child - // iterator so that the children do not need to be store spearately. + // Contains all the needed information to create a stack for doing a depth + // first tranversal of the tree. This includes scopes for values, loads, and + // calls as well as the generation. There is a child iterator so that the + // children do not need to be store spearately. class StackNode { - public: - StackNode(ScopedHTType *availableValues, - LoadHTType *availableLoads, - CallHTType *availableCalls, - unsigned cg, DomTreeNode *n, - DomTreeNode::iterator child, DomTreeNode::iterator end) : - CurrentGeneration(cg), ChildGeneration(cg), Node(n), - ChildIter(child), EndIter(end), - Scopes(availableValues, availableLoads, availableCalls), - Processed(false) {} + public: + StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, + CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n, + DomTreeNode::iterator child, DomTreeNode::iterator end) + : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), + EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls), + Processed(false) {} // Accessors. 
unsigned currentGeneration() { return CurrentGeneration; } @@ -365,9 +363,9 @@ private: bool isProcessed() { return Processed; } void process() { Processed = true; } - private: - StackNode(const StackNode&) LLVM_DELETED_FUNCTION; - void operator=(const StackNode&) LLVM_DELETED_FUNCTION; + private: + StackNode(const StackNode &) = delete; + void operator=(const StackNode &) = delete; // Members. unsigned CurrentGeneration; @@ -379,31 +377,78 @@ private: bool Processed; }; + /// \brief Wrapper class to handle memory instructions, including loads, + /// stores and intrinsic loads and stores defined by the target. + class ParseMemoryInst { + public: + ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) + : Load(false), Store(false), Vol(false), MayReadFromMemory(false), + MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { + MayReadFromMemory = Inst->mayReadFromMemory(); + MayWriteToMemory = Inst->mayWriteToMemory(); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + MemIntrinsicInfo Info; + if (!TTI.getTgtMemIntrinsic(II, Info)) + return; + if (Info.NumMemRefs == 1) { + Store = Info.WriteMem; + Load = Info.ReadMem; + MatchingId = Info.MatchingId; + MayReadFromMemory = Info.ReadMem; + MayWriteToMemory = Info.WriteMem; + Vol = Info.Vol; + Ptr = Info.PtrVal; + } + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + Load = true; + Vol = !LI->isSimple(); + Ptr = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + Store = true; + Vol = !SI->isSimple(); + Ptr = SI->getPointerOperand(); + } + } + bool isLoad() { return Load; } + bool isStore() { return Store; } + bool isVolatile() { return Vol; } + bool isMatchingMemLoc(const ParseMemoryInst &Inst) { + return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + } + bool isValid() { return Ptr != nullptr; } + int getMatchingId() { return MatchingId; } + Value *getPtr() { return Ptr; } + bool mayReadFromMemory() { return MayReadFromMemory; } + bool mayWriteToMemory() { return MayWriteToMemory; } + + private: + bool Load; + bool Store; + bool Vol; + bool MayReadFromMemory; + bool MayWriteToMemory; + // For regular (non-intrinsic) loads/stores, this is set to -1. For + // intrinsic loads/stores, the id is retrieved from the corresponding + // field in the MemIntrinsicInfo structure. That field contains + // non-negative values only. + int MatchingId; + Value *Ptr; + }; + bool processNode(DomTreeNode *Node); - // This transformation requires dominator postdominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); - AU.setPreservesCFG(); + Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const { + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + return LI; + else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return SI->getValueOperand(); + assert(isa<IntrinsicInst>(Inst) && "Instruction not supported"); + return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst), + ExpectedType); } }; } -char EarlyCSE::ID = 0; - -// createEarlyCSEPass - The public interface to this file. 
-FunctionPass *llvm::createEarlyCSEPass() { - return new EarlyCSE(); -} - -INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) - bool EarlyCSE::processNode(DomTreeNode *Node) { BasicBlock *BB = Node->getBlock(); @@ -416,21 +461,46 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (!BB->getSinglePredecessor()) ++CurrentGeneration; + // If this node has a single predecessor which ends in a conditional branch, + // we can infer the value of the branch condition given that we took this + // path. We need the single predeccesor to ensure there's not another path + // which reaches this block where the condition might hold a different + // value. Since we're adding this to the scoped hash table (like any other + // def), it will have been popped if we encounter a future merge block. + if (BasicBlock *Pred = BB->getSinglePredecessor()) + if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator())) + if (BI->isConditional()) + if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition())) + if (SimpleValue::canHandle(CondInst)) { + assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB); + auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ? + ConstantInt::getTrue(BB->getContext()) : + ConstantInt::getFalse(BB->getContext()); + AvailableValues.insert(CondInst, ConditionalConstant); + DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '" + << CondInst->getName() << "' as " << *ConditionalConstant + << " in " << BB->getName() << "\n"); + // Replace all dominated uses with the known value + replaceDominatedUsesWith(CondInst, ConditionalConstant, DT, + BasicBlockEdge(Pred, BB)); + } + /// LastStore - Keep track of the last non-volatile store that we saw... for /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. - StoreInst *LastStore = nullptr; + Instruction *LastStore = nullptr; bool Changed = false; + const DataLayout &DL = BB->getModule()->getDataLayout(); // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *Inst = I++; // Dead instructions should just be removed. - if (isInstructionTriviallyDead(Inst, TLI)) { + if (isInstructionTriviallyDead(Inst, &TLI)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); Inst->eraseFromParent(); Changed = true; @@ -449,7 +519,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AC)) { + if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -461,7 +531,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a simple instruction that we can value number, process it. if (SimpleValue::canHandle(Inst)) { // See if the instruction has an available value. If so, use it. 
- if (Value *V = AvailableValues->lookup(Inst)) { + if (Value *V = AvailableValues.lookup(Inst)) { DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -471,14 +541,15 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, just remember that this value is available. - AvailableValues->insert(Inst, Inst); + AvailableValues.insert(Inst, Inst); continue; } + ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (MemInst.isValid() && MemInst.isLoad()) { // Ignore volatile loads. - if (!LI->isSimple()) { + if (MemInst.isVolatile()) { LastStore = nullptr; // Don't CSE across synchronization boundaries. if (Inst->mayWriteToMemory()) @@ -488,38 +559,48 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If we have an available version of this load, and if it is the right // generation, replace this instruction. - std::pair<Value*, unsigned> InVal = - AvailableLoads->lookup(Inst->getOperand(0)); + std::pair<Value *, unsigned> InVal = + AvailableLoads.lookup(MemInst.getPtr()); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " - << *InVal.first << '\n'); - if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); - Inst->eraseFromParent(); - Changed = true; - ++NumCSELoad; - continue; + Value *Op = getOrCreateResult(InVal.first, Inst->getType()); + if (Op != nullptr) { + DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(Op); + Inst->eraseFromParent(); + Changed = true; + ++NumCSELoad; + continue; + } } // Otherwise, remember that we have this instruction. - AvailableLoads->insert(Inst->getOperand(0), - std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( + Inst, CurrentGeneration)); LastStore = nullptr; continue; } // If this instruction may read from memory, forget LastStore. - if (Inst->mayReadFromMemory()) + // Load/store intrinsics will indicate both a read and a write to + // memory. The target may override this (e.g. so that a store intrinsic + // does not read from memory, and thus will be treated the same as a + // regular store for commoning purposes). + if (Inst->mayReadFromMemory() && + !(MemInst.isValid() && !MemInst.mayReadFromMemory())) LastStore = nullptr; // If this is a read-only call, process it. if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. - std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); + std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " - << *InVal.first << '\n'); - if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst + << " to: " << *InVal.first << '\n'); + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(InVal.first); Inst->eraseFromParent(); Changed = true; ++NumCSECall; @@ -527,8 +608,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, remember that we have this instruction. 
- AvailableCalls->insert(Inst, - std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + AvailableCalls.insert( + Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration)); continue; } @@ -538,17 +619,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (Inst->mayWriteToMemory()) { ++CurrentGeneration; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. - if (LastStore && - LastStore->getPointerOperand() == SI->getPointerOperand()) { - DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " - << *Inst << '\n'); - LastStore->eraseFromParent(); - Changed = true; - ++NumDSE; - LastStore = nullptr; + if (LastStore) { + ParseMemoryInst LastStoreMemInst(LastStore, TTI); + if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { + DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore + << " due to: " << *Inst << '\n'); + LastStore->eraseFromParent(); + Changed = true; + ++NumDSE; + LastStore = nullptr; + } // fallthrough - we can exploit information about this store } @@ -557,12 +640,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // version of the pointer. It is safe to forward from volatile stores // to non-volatile loads, so we don't have to check for volatility of // the store. - AvailableLoads->insert(SI->getPointerOperand(), - std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); + AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( + Inst, CurrentGeneration)); // Remember that this was the last store we saw for DSE. - if (SI->isSimple()) - LastStore = SI; + if (!MemInst.isVolatile()) + LastStore = Inst; } } } @@ -570,40 +653,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { return Changed; } - -bool EarlyCSE::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - // Note, deque is being used here because there is significant performance gains - // over vector when the container becomes very large due to the specific access - // patterns. For more information see the mailing list discussion on this: +bool EarlyCSE::run() { + // Note, deque is being used here because there is significant performance + // gains over vector when the container becomes very large due to the + // specific access patterns. For more information see the mailing list + // discussion on this: // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html std::deque<StackNode *> nodesToProcess; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - - // Tables that the pass uses when walking the domtree. - ScopedHTType AVTable; - AvailableValues = &AVTable; - LoadHTType LoadTable; - AvailableLoads = &LoadTable; - CallHTType CallTable; - AvailableCalls = &CallTable; - - CurrentGeneration = 0; bool Changed = false; // Process the root node. 
- nodesToProcess.push_back( - new StackNode(AvailableValues, AvailableLoads, AvailableCalls, - CurrentGeneration, DT->getRootNode(), - DT->getRootNode()->begin(), - DT->getRootNode()->end())); + nodesToProcess.push_back(new StackNode( + AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration, + DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end())); // Save the current generation. unsigned LiveOutGeneration = CurrentGeneration; @@ -627,11 +690,9 @@ bool EarlyCSE::runOnFunction(Function &F) { // Push the next child onto the stack. DomTreeNode *child = NodeToProcess->nextChild(); nodesToProcess.push_back( - new StackNode(AvailableValues, - AvailableLoads, - AvailableCalls, - NodeToProcess->childGeneration(), child, - child->begin(), child->end())); + new StackNode(AvailableValues, AvailableLoads, AvailableCalls, + NodeToProcess->childGeneration(), child, child->begin(), + child->end())); } else { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. @@ -645,3 +706,74 @@ bool EarlyCSE::runOnFunction(Function &F) { return Changed; } + +PreservedAnalyses EarlyCSEPass::run(Function &F, + AnalysisManager<Function> *AM) { + auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); + auto &TTI = AM->getResult<TargetIRAnalysis>(F); + auto &DT = AM->getResult<DominatorTreeAnalysis>(F); + auto &AC = AM->getResult<AssumptionAnalysis>(F); + + EarlyCSE CSE(F, TLI, TTI, DT, AC); + + if (!CSE.run()) + return PreservedAnalyses::all(); + + // CSE preserves the dominator tree because it doesn't mutate the CFG. + // FIXME: Bundle this with other CFG-preservation. + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +namespace { +/// \brief A simple and fast domtree-based CSE pass. +/// +/// This pass does a simple depth-first walk over the dominator tree, +/// eliminating trivially redundant instructions and using instsimplify to +/// canonicalize things as it goes. It is intended to be fast and catch obvious +/// cases so that instcombine and other passes are more effective. It is +/// expected that a later pass of GVN will catch the interesting/hard cases. 
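The new pass-manager entry point above and the legacy wrapper whose class follows both drive the same EarlyCSE runner; only the analysis plumbing differs. The new-PM version also has to report what it invalidated: if the runner made no change it preserves everything, otherwise only the dominator tree survives, since EarlyCSE never touches the CFG. A toy model of that contract (standalone C++; the names are invented stand-ins, not the real PreservedAnalyses API):

#include <iostream>
#include <set>
#include <string>

// Toy PreservedAnalyses: the analyses that stay valid after a pass runs.
struct Preserved {
  bool all = false;
  std::set<std::string> kept;
  static Preserved everything() { return {true, {}}; }
};

struct Runner {
  bool run() { return true; } // pretend the pass changed the IR
};

Preserved runNewPM() {
  Runner cse;
  if (!cse.run())
    return Preserved::everything(); // untouched IR invalidates nothing
  Preserved p;
  p.kept.insert("DominatorTreeAnalysis"); // CFG unchanged, so DT still holds
  return p;
}

int main() {
  Preserved p = runNewPM();
  std::cout << (p.all ? "all" : *p.kept.begin()) << "\n";
}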
+class EarlyCSELegacyPass : public FunctionPass { +public: + static char ID; + + EarlyCSELegacyPass() : FunctionPass(ID) { + initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + + EarlyCSE CSE(F, TLI, TTI, DT, AC); + + return CSE.run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + } +}; +} + +char EarlyCSELegacyPass::ID = 0; + +FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); } + +INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp new file mode 100644 index 0000000..c931422 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -0,0 +1,540 @@ +//===- Float2Int.cpp - Demote floating point ops to work on integers ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Float2Int pass, which aims to demote floating +// point operations to work on integers, where that is losslessly possible. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "float2int" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include <deque> +#include <functional> // For std::function +using namespace llvm; + +// The algorithm is simple. Start at instructions that convert from the +// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use +// graph, using an equivalence datastructure to unify graphs that interfere. +// +// Mappable instructions are those with an integer corrollary that, given +// integer domain inputs, produce an integer output; fadd, for example. +// +// If a non-mappable instruction is seen, this entire def-use graph is marked +// as non-transformable. If we see an instruction that converts from the +// integer domain to FP domain (uitofp,sitofp), we terminate our walk. 
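The overview above says demotion must be lossless; concretely, the rewrite is safe when every value that flows between the int-to-FP and FP-to-int conversions is exactly representable in the FP type, that is, fits its mantissa. A small standalone check of that property (ordinary C++, not the pass itself):

#include <cstdint>
#include <iostream>

// sitofp -> fadd -> fptosi, the shape of graph this pass rewrites.
int32_t viaFloat(int16_t a, int16_t b) {
  return (int32_t)((float)a + (float)b);
}
// The integer-domain replacement.
int32_t viaInt(int16_t a, int16_t b) { return (int32_t)a + (int32_t)b; }

int main() {
  for (int a = -32768; a <= 32767; a += 257)
    for (int b = -32768; b <= 32767; b += 263)
      if (viaFloat((int16_t)a, (int16_t)b) != viaInt((int16_t)a, (int16_t)b))
        std::cout << "mismatch\n"; // never fires: any i16 sum needs at most
                                   // 17 bits, well inside float's 24-bit
                                   // mantissa, so the FP round trip is exact
  std::cout << "equivalent for i16 inputs\n";
}

With i32 inputs the same experiment fails, which is exactly the case the range analysis in the code that follows exists to rule out.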
+ +/// The largest integer type worth dealing with. +static cl::opt<unsigned> +MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden, + cl::desc("Max integer bitwidth to consider in float2int" + "(default=64)")); + +namespace { + struct Float2Int : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + Float2Int() : FunctionPass(ID) { + initializeFloat2IntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); + ConstantRange seen(Instruction *I, ConstantRange R); + ConstantRange badRange(); + ConstantRange unknownRange(); + ConstantRange validateRange(ConstantRange R); + void walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots); + void walkForwards(); + bool validateAndTransform(); + Value *convert(Instruction *I, Type *ToTy); + void cleanup(); + + MapVector<Instruction*, ConstantRange > SeenInsts; + SmallPtrSet<Instruction*,8> Roots; + EquivalenceClasses<Instruction*> ECs; + MapVector<Instruction*, Value*> ConvertedInsts; + LLVMContext *Ctx; + }; +} + +char Float2Int::ID = 0; +INITIALIZE_PASS(Float2Int, "float2int", "Float to int", false, false) + +// Given a FCmp predicate, return a matching ICmp predicate if one +// exists, otherwise return BAD_ICMP_PREDICATE. +static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) { + switch (P) { + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: + return CmpInst::ICMP_EQ; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: + return CmpInst::ICMP_SGT; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: + return CmpInst::ICMP_SGE; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: + return CmpInst::ICMP_SLT; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: + return CmpInst::ICMP_SLE; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: + return CmpInst::ICMP_NE; + default: + return CmpInst::BAD_ICMP_PREDICATE; + } +} + +// Given a floating point binary operator, return the matching +// integer version. +static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { + switch (Opcode) { + default: llvm_unreachable("Unhandled opcode!"); + case Instruction::FAdd: return Instruction::Add; + case Instruction::FSub: return Instruction::Sub; + case Instruction::FMul: return Instruction::Mul; + } +} + +// Find the roots - instructions that convert from the FP domain to +// integer domain. +void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { + for (auto &I : inst_range(F)) { + switch (I.getOpcode()) { + default: break; + case Instruction::FPToUI: + case Instruction::FPToSI: + Roots.insert(&I); + break; + case Instruction::FCmp: + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + CmpInst::BAD_ICMP_PREDICATE) + Roots.insert(&I); + break; + } + } +} + +// Helper - mark I as having been traversed, having range R. +ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) { + DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n"); + if (SeenInsts.find(I) != SeenInsts.end()) + SeenInsts.find(I)->second = R; + else + SeenInsts.insert(std::make_pair(I, R)); + return R; +} + +// Helper - get a range representing a poison value. 
+ConstantRange Float2Int::badRange() { + return ConstantRange(MaxIntegerBW + 1, true); +} +ConstantRange Float2Int::unknownRange() { + return ConstantRange(MaxIntegerBW + 1, false); +} +ConstantRange Float2Int::validateRange(ConstantRange R) { + if (R.getBitWidth() > MaxIntegerBW + 1) + return badRange(); + return R; +} + +// The most obvious way to structure the search is a depth-first, eager +// search from each root. However, that require direct recursion and so +// can only handle small instruction sequences. Instead, we split the search +// up into two phases: +// - walkBackwards: A breadth-first walk of the use-def graph starting from +// the roots. Populate "SeenInsts" with interesting +// instructions and poison values if they're obvious and +// cheap to compute. Calculate the equivalance set structure +// while we're here too. +// - walkForwards: Iterate over SeenInsts in reverse order, so we visit +// defs before their uses. Calculate the real range info. + +// Breadth-first walk of the use-def graph; determine the set of nodes +// we care about and eagerly determine if some of them are poisonous. +void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { + std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); + while (!Worklist.empty()) { + Instruction *I = Worklist.back(); + Worklist.pop_back(); + + if (SeenInsts.find(I) != SeenInsts.end()) + // Seen already. + continue; + + switch (I->getOpcode()) { + // FIXME: Handle select and phi nodes. + default: + // Path terminated uncleanly. + seen(I, badRange()); + break; + + case Instruction::UIToFP: { + // Path terminated cleanly. + unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits(); + APInt Min = APInt::getMinValue(BW).zextOrSelf(MaxIntegerBW+1); + APInt Max = APInt::getMaxValue(BW).zextOrSelf(MaxIntegerBW+1); + seen(I, validateRange(ConstantRange(Min, Max))); + continue; + } + + case Instruction::SIToFP: { + // Path terminated cleanly. + unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits(); + APInt SMin = APInt::getSignedMinValue(BW).sextOrSelf(MaxIntegerBW+1); + APInt SMax = APInt::getSignedMaxValue(BW).sextOrSelf(MaxIntegerBW+1); + seen(I, validateRange(ConstantRange(SMin, SMax))); + continue; + } + + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FCmp: + seen(I, unknownRange()); + break; + } + + for (Value *O : I->operands()) { + if (Instruction *OI = dyn_cast<Instruction>(O)) { + // Unify def-use chains if they interfere. + ECs.unionSets(I, OI); + if (SeenInsts.find(I)->second != badRange()) + Worklist.push_back(OI); + } else if (!isa<ConstantFP>(O)) { + // Not an instruction or ConstantFP? we can't do anything. + seen(I, badRange()); + } + } + } +} + +// Walk forwards down the list of seen instructions, so we visit defs before +// uses. +void Float2Int::walkForwards() { + for (auto It = SeenInsts.rbegin(), E = SeenInsts.rend(); It != E; ++It) { + if (It->second != unknownRange()) + continue; + + Instruction *I = It->first; + std::function<ConstantRange(ArrayRef<ConstantRange>)> Op; + switch (I->getOpcode()) { + // FIXME: Handle select and phi nodes. 
+ default: + case Instruction::UIToFP: + case Instruction::SIToFP: + llvm_unreachable("Should have been handled in walkForwards!"); + + case Instruction::FAdd: + Op = [](ArrayRef<ConstantRange> Ops) { + assert(Ops.size() == 2 && "FAdd is a binary operator!"); + return Ops[0].add(Ops[1]); + }; + break; + + case Instruction::FSub: + Op = [](ArrayRef<ConstantRange> Ops) { + assert(Ops.size() == 2 && "FSub is a binary operator!"); + return Ops[0].sub(Ops[1]); + }; + break; + + case Instruction::FMul: + Op = [](ArrayRef<ConstantRange> Ops) { + assert(Ops.size() == 2 && "FMul is a binary operator!"); + return Ops[0].multiply(Ops[1]); + }; + break; + + // + // Root-only instructions - we'll only see these if they're the + // first node in a walk. + // + case Instruction::FPToUI: + case Instruction::FPToSI: + Op = [](ArrayRef<ConstantRange> Ops) { + assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!"); + return Ops[0]; + }; + break; + + case Instruction::FCmp: + Op = [](ArrayRef<ConstantRange> Ops) { + assert(Ops.size() == 2 && "FCmp is a binary operator!"); + return Ops[0].unionWith(Ops[1]); + }; + break; + } + + bool Abort = false; + SmallVector<ConstantRange,4> OpRanges; + for (Value *O : I->operands()) { + if (Instruction *OI = dyn_cast<Instruction>(O)) { + assert(SeenInsts.find(OI) != SeenInsts.end() && + "def not seen before use!"); + OpRanges.push_back(SeenInsts.find(OI)->second); + } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { + // Work out if the floating point number can be losslessly represented + // as an integer. + // APFloat::convertToInteger(&Exact) purports to do what we want, but + // the exactness can be too precise. For example, negative zero can + // never be exactly converted to an integer. + // + // Instead, we ask APFloat to round itself to an integral value - this + // preserves sign-of-zero - then compare the result with the original. + // + APFloat F = CF->getValueAPF(); + + // First, weed out obviously incorrect values. Non-finite numbers + // can't be represented and neither can negative zero, unless + // we're in fast math mode. + if (!F.isFinite() || + (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && + !I->hasNoSignedZeros())) { + seen(I, badRange()); + Abort = true; + break; + } + + APFloat NewF = F; + auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven); + if (Res != APFloat::opOK || NewF.compare(F) != APFloat::cmpEqual) { + seen(I, badRange()); + Abort = true; + break; + } + // OK, it's representable. Now get it. + APSInt Int(MaxIntegerBW+1, false); + bool Exact; + CF->getValueAPF().convertToInteger(Int, + APFloat::rmNearestTiesToEven, + &Exact); + OpRanges.push_back(ConstantRange(Int)); + } else { + llvm_unreachable("Should have already marked this as badRange!"); + } + } + + // Reduce the operands' ranges to a single range and return. + if (!Abort) + seen(I, Op(OpRanges)); + } +} + +// If there is a valid transform to be done, do it. +bool Float2Int::validateAndTransform() { + bool MadeChange = false; + + // Iterate over every disjoint partition of the def-use graph. + for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) { + ConstantRange R(MaxIntegerBW + 1, false); + bool Fail = false; + Type *ConvertedToTy = nullptr; + + // For every member of the partition, union all the ranges together. 
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end(); + MI != ME; ++MI) { + Instruction *I = *MI; + auto SeenI = SeenInsts.find(I); + if (SeenI == SeenInsts.end()) + continue; + + R = R.unionWith(SeenI->second); + // We need to ensure I has no users that have not been seen. + // If it does, transformation would be illegal. + // + // Don't count the roots, as they terminate the graphs. + if (Roots.count(I) == 0) { + // Set the type of the conversion while we're here. + if (!ConvertedToTy) + ConvertedToTy = I->getType(); + for (User *U : I->users()) { + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI || SeenInsts.find(UI) == SeenInsts.end()) { + DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n"); + Fail = true; + break; + } + } + } + if (Fail) + break; + } + + // If the set was empty, or we failed, or the range is poisonous, + // bail out. + if (ECs.member_begin(It) == ECs.member_end() || Fail || + R.isFullSet() || R.isSignWrappedSet()) + continue; + assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); + + // The number of bits required is the maximum of the upper and + // lower limits, plus one so it can be signed. + unsigned MinBW = std::max(R.getLower().getMinSignedBits(), + R.getUpper().getMinSignedBits()) + 1; + DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n"); + + // If we've run off the realms of the exactly representable integers, + // the floating point result will differ from an integer approximation. + + // Do we need more bits than are in the mantissa of the type we converted + // to? semanticsPrecision returns the number of mantissa bits plus one + // for the sign bit. + unsigned MaxRepresentableBits + = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1; + if (MinBW > MaxRepresentableBits) { + DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n"); + continue; + } + if (MinBW > 64) { + DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n"); + continue; + } + + // OK, R is known to be representable. Now pick a type for it. + // FIXME: Pick the smallest legal type that will fit. + Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx); + + for (auto MI = ECs.member_begin(It), ME = ECs.member_end(); + MI != ME; ++MI) + convert(*MI, Ty); + MadeChange = true; + } + + return MadeChange; +} + +Value *Float2Int::convert(Instruction *I, Type *ToTy) { + if (ConvertedInsts.find(I) != ConvertedInsts.end()) + // Already converted this instruction. + return ConvertedInsts[I]; + + SmallVector<Value*,4> NewOperands; + for (Value *V : I->operands()) { + // Don't recurse if we're an instruction that terminates the path. + if (I->getOpcode() == Instruction::UIToFP || + I->getOpcode() == Instruction::SIToFP) { + NewOperands.push_back(V); + } else if (Instruction *VI = dyn_cast<Instruction>(V)) { + NewOperands.push_back(convert(VI, ToTy)); + } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) { + APSInt Val(ToTy->getPrimitiveSizeInBits(), /*IsUnsigned=*/false); + bool Exact; + CF->getValueAPF().convertToInteger(Val, + APFloat::rmNearestTiesToEven, + &Exact); + NewOperands.push_back(ConstantInt::get(ToTy, Val)); + } else { + llvm_unreachable("Unhandled operand type?"); + } + } + + // Now create a new instruction. 
+ IRBuilder<> IRB(I); + Value *NewV = nullptr; + switch (I->getOpcode()) { + default: llvm_unreachable("Unhandled instruction!"); + + case Instruction::FPToUI: + NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType()); + break; + + case Instruction::FPToSI: + NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType()); + break; + + case Instruction::FCmp: { + CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate()); + assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!"); + NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName()); + break; + } + + case Instruction::UIToFP: + NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy); + break; + + case Instruction::SIToFP: + NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy); + break; + + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()), + NewOperands[0], NewOperands[1], + I->getName()); + break; + } + + // If we're a root instruction, RAUW. + if (Roots.count(I)) + I->replaceAllUsesWith(NewV); + + ConvertedInsts[I] = NewV; + return NewV; +} + +// Perform dead code elimination on the instructions we just modified. +void Float2Int::cleanup() { + for (auto I = ConvertedInsts.rbegin(), E = ConvertedInsts.rend(); + I != E; ++I) + I->first->eraseFromParent(); +} + +bool Float2Int::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + + DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); + // Clear out all state. + ECs = EquivalenceClasses<Instruction*>(); + SeenInsts.clear(); + ConvertedInsts.clear(); + Roots.clear(); + + Ctx = &F.getParent()->getContext(); + + findRoots(F, Roots); + + walkBackwards(Roots); + walkForwards(); + + bool Modified = validateAndTransform(); + if (Modified) + cleanup(); + return Modified; +} + +FunctionPass *llvm::createFloat2IntPass() { + return new Float2Int(); +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 1ed14d0..7770ddc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -45,7 +46,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -458,7 +459,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { return e; } -/// lookup - Returns the value number of the specified value. Fails if +/// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. uint32_t ValueTable::lookup(Value *V) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); @@ -466,7 +467,7 @@ uint32_t ValueTable::lookup(Value *V) const { return VI->second; } -/// lookup_or_add_cmp - Returns the value number of the given comparison, +/// Returns the value number of the given comparison, /// assigning it a new number if it did not have one before. 
Useful when /// we deduced the result of a comparison, but don't immediately have an /// instruction realizing that comparison to hand. @@ -479,14 +480,14 @@ uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode, return e; } -/// clear - Remove all entries from the ValueTable. +/// Remove all entries from the ValueTable. void ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); nextValueNumber = 1; } -/// erase - Remove a value from the value numbering. +/// Remove a value from the value numbering. void ValueTable::erase(Value *V) { valueNumbering.erase(V); } @@ -582,23 +583,22 @@ namespace { return cast<MemIntrinsic>(Val.getPointer()); } - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; + /// Emit code into this block to adjust the value defined here to the + /// specified type. This handles various coercion cases. + Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const; }; class GVN : public FunctionPass { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const DataLayout *DL; const TargetLibraryInfo *TLI; AssumptionCache *AC; SetVector<BasicBlock *> DeadBlocks; ValueTable VN; - /// LeaderTable - A mapping from value numbers to lists of Value*'s that + /// A mapping from value numbers to lists of Value*'s that /// have that value number. Use findLeader to query it. struct LeaderTableEntry { Value *Val; @@ -623,20 +623,18 @@ namespace { bool runOnFunction(Function &F) override; - /// markInstructionForDeletion - This removes the specified instruction from + /// This removes the specified instruction from /// our various maps and marks it for deletion. void markInstructionForDeletion(Instruction *I) { VN.erase(I); InstrsToErase.push_back(I); } - const DataLayout *getDataLayout() const { return DL; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } private: - /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for - /// its value number. + /// Push a new Value to the LeaderTable onto the list for its value number. void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) { LeaderTableEntry &Curr = LeaderTable[N]; if (!Curr.Val) { @@ -652,7 +650,7 @@ namespace { Curr.Next = Node; } - /// removeFromLeaderTable - Scan the list of values corresponding to a given + /// Scan the list of values corresponding to a given /// value number, and remove the given instruction if encountered. 
void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { LeaderTableEntry* Prev = nullptr; @@ -685,7 +683,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); @@ -711,13 +709,13 @@ namespace { bool iterateOnFunction(Function &F); bool performPRE(Function &F); bool performScalarPRE(Instruction *I); + bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + unsigned int ValNo); Value *findLeader(const BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - unsigned replaceAllDominatedUsesWith(Value *From, Value *To, - const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); bool processFoldableCondBr(BranchInst *BI); void addDeadBlock(BasicBlock *BB); @@ -727,7 +725,7 @@ namespace { char GVN::ID = 0; } -// createGVNPass - The public interface to this file... +// The public interface to this file... FunctionPass *llvm::createGVNPass(bool NoLoads) { return new GVN(NoLoads); } @@ -736,7 +734,7 @@ INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) @@ -752,7 +750,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } #endif -/// IsValueFullyAvailableInBlock - Return true if we can prove that the value +/// Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep /// track of which blocks we know are fully alive in FullyAvailableBlocks. This /// map is actually a tri-state map with the following values: @@ -798,7 +796,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB, return true; -// SpeculationFailure - If we get here, we found out that this is not, after +// If we get here, we found out that this is not, after // all, a fully-available block. We have a problem if we speculated on this and // used the speculation to mark other blocks as available. SpeculationFailure: @@ -833,8 +831,7 @@ SpeculationFailure: } -/// CanCoerceMustAliasedValueToLoad - Return true if -/// CoerceAvailableValueToLoadType will succeed. +/// Return true if CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, const DataLayout &DL) { @@ -853,7 +850,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } -/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and +/// If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce /// the stored value. LoadedTy is the type of the load we want to replace and /// InsertPt is the place to insert new instructions. 
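The coercion logic in the hunks around here rests on one size rule, restated in the next hunks: a must-aliased store can feed a load of a different type only when the stored value carries at least as many bits as the load reads. A minimal sketch of that predicate, using only the DataLayout query visible in this diff; the helper name is hypothetical, and the real CanCoerceMustAliasedValueToLoad applies additional guards (sized types, pointer handling) before reaching this check:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"

    // Sketch only: the "stored value is larger or equal to the loaded value"
    // reuse rule from this diff, with the other guards omitted.
    static bool mayReuseStoredValue(llvm::Type *StoredTy, llvm::Type *LoadTy,
                                    const llvm::DataLayout &DL) {
      // A smaller store cannot supply every bit the load consumes.
      return DL.getTypeSizeInBits(StoredTy) >= DL.getTypeSizeInBits(LoadTy);
    }
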
@@ -938,7 +935,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt); } -/// AnalyzeLoadFromClobberingWrite - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering memory write (store, /// memset, memcpy, memmove). This means that the write *may* provide bits used /// by the load but we can't be sure because the pointers don't mustalias. @@ -956,8 +953,9 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&DL); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &DL); + Value *StoreBase = + GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL); if (StoreBase != LoadBase) return -1; @@ -1018,23 +1016,23 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return LoadOffset-StoreOffset; } -/// AnalyzeLoadFromClobberingStore - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being a clobbering store. static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, - StoreInst *DepSI, - const DataLayout &DL) { + StoreInst *DepSI) { // Cannot handle reading from store of first-class aggregate yet. if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) return -1; + const DataLayout &DL = DepSI->getModule()->getDataLayout(); Value *StorePtr = DepSI->getPointerOperand(); uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()); return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize, DL); } -/// AnalyzeLoadFromClobberingLoad - This function is called when we have a +/// This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, @@ -1052,11 +1050,11 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, // then we should widen it! int64_t LoadOffs = 0; const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &DL); + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL); unsigned LoadSize = DL.getTypeStoreSize(LoadTy); - unsigned Size = MemoryDependenceAnalysis:: - getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, DL); + unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize( + LoadBase, LoadOffs, LoadSize, DepLI); if (Size == 0) return -1; return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL); @@ -1086,7 +1084,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, Constant *Src = dyn_cast<Constant>(MTI->getSource()); if (!Src) return -1; - GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &DL)); + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL)); if (!GV || !GV->isConstant()) return -1; // See if the access is within the bounds of the transfer. 
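The AnalyzeLoadFromClobbering* helpers above all reduce to the same byte arithmetic: strip both pointers to a common base with GetPointerBaseWithConstantOffset, then check that the loaded bytes fall inside the written bytes. A standalone sketch of that containment test (plain C++, hypothetical name); the real helpers also distinguish the different-base and partial-overlap cases seen above:

    #include <cstdint>

    // Returns the load's byte offset into the stored region, or -1 when the
    // store cannot supply every byte the load needs.
    static int64_t coveredLoadOffset(int64_t StoreOffset, uint64_t StoreSize,
                                     int64_t LoadOffset, uint64_t LoadSize) {
      if (LoadOffset < StoreOffset)
        return -1;  // load starts before the stored bytes
      if (LoadOffset + (int64_t)LoadSize > StoreOffset + (int64_t)StoreSize)
        return -1;  // load runs past the end of the stored bytes
      return LoadOffset - StoreOffset;  // the "LoadOffset-StoreOffset" result
    }
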
@@ -1102,15 +1100,16 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
                                  Type::getInt8PtrTy(Src->getContext(), AS));
   Constant *OffsetCst =
     ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
-  Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
+  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                       OffsetCst);
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
-  if (ConstantFoldLoadFromConstPtr(Src, &DL))
+  if (ConstantFoldLoadFromConstPtr(Src, DL))
     return Offset;
   return -1;
 }
 
-/// GetStoreValueForLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering store. This means
 /// that the store provides bits used by the load but the pointers don't
 /// mustalias. Check this case to see if there is anything more we can do
@@ -1149,7 +1148,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
   return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL);
 }
 
-/// GetLoadValueForLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering load. This means
 /// that the load *may* provide bits used by the load but we can't be sure
 /// because the pointers don't mustalias. Check this case to see if there is
@@ -1157,7 +1156,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
 static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
                                   Type *LoadTy, Instruction *InsertPt,
                                   GVN &gvn) {
-  const DataLayout &DL = *gvn.getDataLayout();
+  const DataLayout &DL = SrcVal->getModule()->getDataLayout();
   // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
   // widen SrcVal out to a larger load.
   unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
@@ -1212,7 +1211,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
 }
 
-/// GetMemInstValueForLoad - This function is called when we have a
+/// This function is called when we have a
 /// memdep query of a load that ends up being a clobbering mem intrinsic.
 static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
                                      Type *LoadTy, Instruction *InsertPt,
@@ -1263,13 +1262,14 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
                                  Type::getInt8PtrTy(Src->getContext(), AS));
   Constant *OffsetCst =
     ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
-  Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
+  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                       OffsetCst);
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
-  return ConstantFoldLoadFromConstPtr(Src, &DL);
+  return ConstantFoldLoadFromConstPtr(Src, DL);
 }
 
-/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
+/// Given a set of loads specified by ValuesPerBlock,
 /// construct SSA form, allowing us to eliminate LI. This returns the value
 /// that should be used at LI's definition site.
static Value *ConstructSSAForLoadSet(LoadInst *LI, @@ -1281,7 +1281,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) { assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block"); - return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); + return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn); } // Otherwise, we have to construct SSA form. @@ -1289,8 +1289,6 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - Type *LoadTy = LI->getType(); - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; BasicBlock *BB = AV.BB; @@ -1298,7 +1296,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (SSAUpdate.HasValueForBlock(BB)) continue; - SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn)); } // Perform PHI construction. @@ -1326,16 +1324,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } -Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { +Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, + GVN &gvn) const { Value *Res; + Type *LoadTy = LI->getType(); + const DataLayout &DL = LI->getModule()->getDataLayout(); if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const DataLayout *DL = gvn.getDataLayout(); - assert(DL && "Need target data to handle type mismatch case"); - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *DL); - + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), DL); + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1353,10 +1351,8 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c << *Res << '\n' << "\n\n\n"); } } else if (isMemIntrinValue()) { - const DataLayout *DL = gvn.getDataLayout(); - assert(DL && "Need target data to handle type mismatch case"); - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *DL); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, + BB->getTerminator(), DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1383,6 +1379,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). unsigned NumDeps = Deps.size(); + const DataLayout &DL = LI->getModule()->getDataLayout(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1409,9 +1406,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // read by the load, we can extract the bits we need for the load from the // stored value. 
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { - if (DL && Address) { - int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, - DepSI, *DL); + if (Address) { + int Offset = + AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, DepSI->getValueOperand(), @@ -1428,9 +1425,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { // If this is a clobber and L is the first instruction in its block, then // we have the first instruction in the entry block. - if (DepLI != LI && Address && DL) { - int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), Address, - DepLI, *DL); + if (DepLI != LI && Address) { + int Offset = + AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, @@ -1443,9 +1440,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // If the clobbering value is a memset/memcpy/memmove, see if we can // forward a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { - if (DL && Address) { + if (Address) { int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, - DepMI, *DL); + DepMI, DL); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); @@ -1484,8 +1481,8 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (S->getValueOperand()->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (!DL || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), - LI->getType(), *DL)) { + if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1501,7 +1498,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LD->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (!DL || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)) { + if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1613,6 +1610,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; + const DataLayout &DL = LI->getModule()->getDataLayout(); SmallVector<Instruction*, 8> NewInsts; for (auto &PredLoad : PredLoads) { BasicBlock *UnavailablePred = PredLoad.first; @@ -1704,7 +1702,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return true; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { // Step 1: Find the non-local dependencies of the load. @@ -1817,7 +1815,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { I->replaceAllUsesWith(Repl); } -/// processLoad - Attempt to eliminate a load, first by eliminating it +/// Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. bool GVN::processLoad(LoadInst *L) { if (!MD) @@ -1833,10 +1831,11 @@ bool GVN::processLoad(LoadInst *L) { // ... 
to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); + const DataLayout &DL = L->getModule()->getDataLayout(); // If we have a clobber and target data is around, see if this is a clobber // that we can fix up through code synthesis. - if (Dep.isClobber() && DL) { + if (Dep.isClobber()) { // Check to see if we have something like this: // store i32 123, i32* %P // %A = bitcast i32* %P to i8* @@ -1849,12 +1848,11 @@ bool GVN::processLoad(LoadInst *L) { // access code. Value *AvailVal = nullptr; if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { - int Offset = AnalyzeLoadFromClobberingStore(L->getType(), - L->getPointerOperand(), - DepSI, *DL); + int Offset = AnalyzeLoadFromClobberingStore( + L->getType(), L->getPointerOperand(), DepSI); if (Offset != -1) AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, - L->getType(), L, *DL); + L->getType(), L, DL); } // Check to see if we have something like this: @@ -1867,9 +1865,8 @@ bool GVN::processLoad(LoadInst *L) { if (DepLI == L) return false; - int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), - L->getPointerOperand(), - DepLI, *DL); + int Offset = AnalyzeLoadFromClobberingLoad( + L->getType(), L->getPointerOperand(), DepLI, DL); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } @@ -1877,11 +1874,10 @@ bool GVN::processLoad(LoadInst *L) { // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { - int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), - L->getPointerOperand(), - DepMI, *DL); + int Offset = AnalyzeLoadFromClobberingMemInst( + L->getType(), L->getPointerOperand(), DepMI, DL); if (Offset != -1) - AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *DL); + AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL); } if (AvailVal) { @@ -1932,17 +1928,13 @@ bool GVN::processLoad(LoadInst *L) { // actually have the same type. See if we know how to reuse the stored // value (depending on its type). if (StoredVal->getType() != L->getType()) { - if (DL) { - StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), - L, *DL); - if (!StoredVal) - return false; - - DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal - << '\n' << *L << "\n\n\n"); - } - else + StoredVal = + CoerceAvailableValueToLoadType(StoredVal, L->getType(), L, DL); + if (!StoredVal) return false; + + DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal + << '\n' << *L << "\n\n\n"); } // Remove it! @@ -1961,17 +1953,12 @@ bool GVN::processLoad(LoadInst *L) { // the same type. See if we know how to reuse the previously loaded value // (depending on its type). if (DepLI->getType() != L->getType()) { - if (DL) { - AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), - L, *DL); - if (!AvailableVal) - return false; - - DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal - << "\n" << *L << "\n\n\n"); - } - else + AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L, DL); + if (!AvailableVal) return false; + + DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal + << "\n" << *L << "\n\n\n"); } // Remove it! 
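The findLeader routine documented in the next hunk walks the LeaderTable introduced earlier: each value number maps to a singly linked list of (value, block) entries, and the leader is the first entry whose block dominates the block being queried. A sketch of that scan under the structure shown above; findLeaderIn and its parameters are illustrative, not the pass's actual signature:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Value.h"

    struct LeaderTableEntry {
      llvm::Value *Val;              // may be null after removeFromLeaderTable
      const llvm::BasicBlock *BB;
      LeaderTableEntry *Next;        // next candidate with the same value number
    };

    static llvm::Value *findLeaderIn(const LeaderTableEntry *Head,
                                     const llvm::BasicBlock *BB,
                                     const llvm::DominatorTree &DT) {
      for (const LeaderTableEntry *E = Head; E; E = E->Next)
        if (E->Val && DT.dominates(E->BB, BB))  // leader must dominate the use
          return E->Val;
      return nullptr;
    }
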
@@ -2016,7 +2003,7 @@ bool GVN::processLoad(LoadInst *L) { return false; } -// findLeader - In order to find a leader for a given value number at a +// In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only @@ -2044,25 +2031,7 @@ Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { return Val; } -/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the -/// use is dominated by the given basic block. Returns the number of uses that -/// were replaced. -unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, - const BasicBlockEdge &Root) { - unsigned Count = 0; - for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); - UI != UE; ) { - Use &U = *UI++; - - if (DT->dominates(Root, U)) { - U.set(To); - ++Count; - } - } - return Count; -} - -/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// There is an edge from 'Src' to 'Dst'. Return /// true if every path from the entry block to 'Dst' passes via this edge. In /// particular 'Dst' must not be reachable via another edge from 'Src'. static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, @@ -2079,7 +2048,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } -/// propagateEquality - The given values are known to be equal in every block +/// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. bool GVN::propagateEquality(Value *LHS, Value *RHS, @@ -2138,7 +2107,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // LHS always has at least one use that is not dominated by Root, this will // never do anything if LHS has only one use. if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + unsigned NumReplacements = replaceDominatedUsesWith(LHS, RHS, *DT, Root); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2210,7 +2179,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, Value *NotCmp = findLeader(Root.getEnd(), Num); if (NotCmp && isa<Instruction>(NotCmp)) { unsigned NumReplacements = - replaceAllDominatedUsesWith(NotCmp, NotVal, Root); + replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2229,7 +2198,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, return Changed; } -/// processInstruction - When calculating availability, handle an instruction +/// When calculating availability, handle an instruction /// by inserting it into the appropriate sets bool GVN::processInstruction(Instruction *I) { // Ignore dbg info intrinsics. @@ -2240,6 +2209,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. 
+ const DataLayout &DL = I->getModule()->getDataLayout(); if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { I->replaceAllUsesWith(V); if (MD && V->getType()->getScalarType()->isPointerTy()) @@ -2358,10 +2328,8 @@ bool GVN::runOnFunction(Function& F) { if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2374,7 +2342,8 @@ bool GVN::runOnFunction(Function& F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { BasicBlock *BB = FI++; - bool removedBlock = MergeBlockIntoPredecessor(BB, this); + bool removedBlock = MergeBlockIntoPredecessor( + BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2457,6 +2426,43 @@ bool GVN::processBlock(BasicBlock *BB) { return ChangedFunction; } +// Instantiate an expression in a predecessor that lacked it. +bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, + unsigned int ValNo) { + // Because we are going top-down through the block, all value numbers + // will be available in the predecessor by the time we need them. Any + // that weren't originally present will have been instantiated earlier + // in this loop. + bool success = true; + for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) { + Value *Op = Instr->getOperand(i); + if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) + continue; + + if (Value *V = findLeader(Pred, VN.lookup(Op))) { + Instr->setOperand(i, V); + } else { + success = false; + break; + } + } + + // Fail out if we encounter an operand that is not available in + // the PRE predecessor. This is typically because of loads which + // are not value numbered precisely. + if (!success) + return false; + + Instr->insertBefore(Pred->getTerminator()); + Instr->setName(Instr->getName() + ".pre"); + Instr->setDebugLoc(Instr->getDebugLoc()); + VN.add(Instr, ValNo); + + // Update the availability map to include the new instruction. + addToLeaderTable(ValNo, Instr, Pred); + return true; +} + bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; @@ -2523,60 +2529,43 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Don't do PRE when it might increase code size, i.e. when // we would need to insert instructions in more than one pred. - if (NumWithout != 1 || NumWith == 0) + if (NumWithout > 1 || NumWith == 0) return false; - // Don't do PRE across indirect branch. - if (isa<IndirectBrInst>(PREPred->getTerminator())) - return false; + // We may have a case where all predecessors have the instruction, + // and we just need to insert a phi node. Otherwise, perform + // insertion. + Instruction *PREInstr = nullptr; - // We can't do PRE safely on a critical edge, so instead we schedule - // the edge to be split and perform the PRE the next time we iterate - // on the function. 
- unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { - toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); - return false; - } - - // Instantiate the expression in the predecessor that lacked it. - // Because we are going top-down through the block, all value numbers - // will be available in the predecessor by the time we need them. Any - // that weren't originally present will have been instantiated earlier - // in this loop. - Instruction *PREInstr = CurInst->clone(); - bool success = true; - for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { - Value *Op = PREInstr->getOperand(i); - if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) - continue; + if (NumWithout != 0) { + // Don't do PRE across indirect branch. + if (isa<IndirectBrInst>(PREPred->getTerminator())) + return false; - if (Value *V = findLeader(PREPred, VN.lookup(Op))) { - PREInstr->setOperand(i, V); - } else { - success = false; - break; + // We can't do PRE safely on a critical edge, so instead we schedule + // the edge to be split and perform the PRE the next time we iterate + // on the function. + unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); + if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); + return false; + } + // We need to insert somewhere, so let's give it a shot + PREInstr = CurInst->clone(); + if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + // If we failed insertion, make sure we remove the instruction. + DEBUG(verifyRemoved(PREInstr)); + delete PREInstr; + return false; } } - // Fail out if we encounter an operand that is not available in - // the PRE predecessor. This is typically because of loads which - // are not value numbered precisely. - if (!success) { - DEBUG(verifyRemoved(PREInstr)); - delete PREInstr; - return false; - } + // Either we should have filled in the PRE instruction, or we should + // not have needed insertions. + assert (PREInstr != nullptr || NumWithout == 0); - PREInstr->insertBefore(PREPred->getTerminator()); - PREInstr->setName(CurInst->getName() + ".pre"); - PREInstr->setDebugLoc(CurInst->getDebugLoc()); - VN.add(PREInstr, ValNo); ++NumGVNPRE; - // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, PREInstr, PREPred); - // Create a PHI to make the value available in this block. PHINode *Phi = PHINode::Create(CurInst->getType(), predMap.size(), @@ -2612,10 +2601,12 @@ bool GVN::performScalarPRE(Instruction *CurInst) { MD->removeInstruction(CurInst); DEBUG(verifyRemoved(CurInst)); CurInst->eraseFromParent(); + ++NumGVNInstr; + return true; } -/// performPRE - Perform a purely local form of PRE that looks for diamond +/// Perform a purely local form of PRE that looks for diamond /// control flow patterns and attempts to perform simple PRE at the join point. bool GVN::performPRE(Function &F) { bool Changed = false; @@ -2645,26 +2636,28 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. 
 BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
-  BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this);
+  BasicBlock *BB = SplitCriticalEdge(
+      Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
   if (MD)
     MD->invalidateCachedPredecessors();
   return BB;
 }
 
-/// splitCriticalEdges - Split critical edges found during the previous
+/// Split critical edges found during the previous
 /// iteration that may enable further optimization.
 bool GVN::splitCriticalEdges() {
   if (toSplit.empty())
     return false;
   do {
     std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
-    SplitCriticalEdge(Edge.first, Edge.second, this);
+    SplitCriticalEdge(Edge.first, Edge.second,
+                      CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
   } while (!toSplit.empty());
   if (MD)
     MD->invalidateCachedPredecessors();
   return true;
 }
 
-/// iterateOnFunction - Executes one iteration of GVN
+/// Executes one iteration of GVN
 bool GVN::iterateOnFunction(Function &F) {
   cleanupGlobalSets();
 
@@ -2695,7 +2688,7 @@ void GVN::cleanupGlobalSets() {
   TableAllocator.Reset();
 }
 
-/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// Verify that the specified instruction does not occur in our
 /// internal data structures.
 void GVN::verifyRemoved(const Instruction *Inst) const {
   VN.verifyRemoved(Inst);
@@ -2714,11 +2707,10 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
   }
 }
 
-// BB is declared dead, which implied other blocks become dead as well. This
-// function is to add all these blocks to "DeadBlocks". For the dead blocks'
-// live successors, update their phi nodes by replacing the operands
-// corresponding to dead blocks with UndefVal.
-//
+/// BB is declared dead, which implies other blocks become dead as well. This
+/// function adds all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
void GVN::addDeadBlock(BasicBlock *BB) { SmallVector<BasicBlock *, 4> NewDead; SmallSetVector<BasicBlock *, 4> DF; diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index c01f57f..600589c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,6 +31,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -44,7 +45,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -73,7 +73,6 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - const DataLayout *DL; TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; @@ -82,8 +81,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), - DL(nullptr), Changed(false) { + IndVarSimplify() + : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -91,7 +90,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -126,7 +125,7 @@ char IndVarSimplify::ID = 0; INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -622,17 +621,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->eraseFromParent(); } } - - // If we were unable to completely replace the PHI node, clone the PHI - // and delete the original one. This lets IVUsers and any other maps - // purge the original user from their records. - if (!LCSSASafePhiForRAUW) { - PHINode *NewPN = cast<PHINode>(PN->clone()); - NewPN->takeName(PN); - NewPN->insertBefore(PN); - PN->replaceAllUsesWith(NewPN); - PN->eraseFromParent(); - } } } @@ -663,14 +651,14 @@ namespace { /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, - const DataLayout *DL, const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) return; Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); - if (DL && !DL->isLegalInteger(Width)) + if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) return; // Cast is either an sext or zext up to this point. 
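The hunk just above rewires visitIVCast's legality check from the optional DataLayoutPass to the module's always-present DataLayout. The guard itself is simple: a cast only makes the induction variable a widening candidate when the target natively supports integers of the destination width. A sketch of that guard as a hypothetical free function; the pass computes the width via ScalarEvolution rather than getPrimitiveSizeInBits:

    #include <cstdint>
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    static bool isWidthLegalForWidening(const llvm::CastInst *Cast) {
      const llvm::DataLayout &DL = Cast->getModule()->getDataLayout();
      uint64_t Width = Cast->getType()->getPrimitiveSizeInBits();
      return DL.isLegalInteger(Width);  // e.g. true for i64 on a 64-bit target
    }
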
@@ -916,8 +904,8 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {
   return AddRec;
 }
 
-/// GetWideRecurrence - Is this instruction potentially interesting from
-/// IVUsers' perspective after widening it's type? In other words, can the
+/// GetWideRecurrence - Is this instruction potentially interesting for further
+/// simplification after widening its type? In other words, can the
 /// extend be safely hoisted out of the loop with SCEV reducing the value to a
 /// recurrence on the same loop. If so, return the sign or zero extended
 /// recurrence. Otherwise return NULL.
@@ -1201,7 +1189,6 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
 namespace {
   class IndVarSimplifyVisitor : public IVVisitor {
     ScalarEvolution *SE;
-    const DataLayout *DL;
     const TargetTransformInfo *TTI;
     PHINode *IVPhi;
 
@@ -1209,9 +1196,9 @@ namespace {
     WideIVInfo WI;
 
     IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
-                          const DataLayout *DL, const TargetTransformInfo *TTI,
+                          const TargetTransformInfo *TTI,
                           const DominatorTree *DTree)
-      : SE(SCEV), DL(DL), TTI(TTI), IVPhi(IV) {
+      : SE(SCEV), TTI(TTI), IVPhi(IV) {
       DT = DTree;
       WI.NarrowIV = IVPhi;
       if (ReduceLiveIVs)
@@ -1219,9 +1206,7 @@ namespace {
     }
 
     // Implement the interface used by simplifyUsersOfIV.
-    void visitCast(CastInst *Cast) override {
-      visitIVCast(Cast, WI, SE, DL, TTI);
-    }
+    void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
   };
 }
 
@@ -1255,7 +1240,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,
     PHINode *CurrIV = LoopPhis.pop_back_val();
 
     // Information about sign/zero extensions of CurrIV.
-    IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, TTI, DT);
+    IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
 
     Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor);
 
@@ -1278,55 +1263,6 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,
 //  LinearFunctionTestReplace and its kin. Rewrite the loop exit condition.
 //===----------------------------------------------------------------------===//
 
-/// Check for expressions that ScalarEvolution generates to compute
-/// BackedgeTakenInfo. If these expressions have not been reduced, then
-/// expanding them may incur additional cost (albeit in the loop preheader).
-static bool isHighCostExpansion(const SCEV *S, BranchInst *BI,
-                                SmallPtrSetImpl<const SCEV*> &Processed,
-                                ScalarEvolution *SE) {
-  if (!Processed.insert(S).second)
-    return false;
-
-  // If the backedge-taken count is a UDiv, it's very likely a UDiv that
-  // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a
-  // precise expression, rather than a UDiv from the user's code. If we can't
-  // find a UDiv in the code with some simple searching, assume the former and
-  // forego rewriting the loop.
-  if (isa<SCEVUDivExpr>(S)) {
-    ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
-    if (!OrigCond) return true;
-    const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
-    R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
-    if (R != S) {
-      const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
-      L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
-      if (L != S)
-        return true;
-    }
-  }
-
-  // Recurse past add expressions, which commonly occur in the
-  // BackedgeTakenCount. They may already exist in program code, and if not,
-  // they are not too expensive rematerialize.
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { - for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) { - if (isHighCostExpansion(*I, BI, Processed, SE)) - return true; - } - return false; - } - - // HowManyLessThans uses a Max expression whenever the loop is not guarded by - // the exit condition. - if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S)) - return true; - - // If we haven't recognized an expensive SCEV pattern, assume it's an - // expression produced by program code. - return false; -} - /// canExpandBackedgeTakenCount - Return true if this loop's backedge taken /// count expression can be safely and cheaply expanded into an instruction /// sequence that can be used by LinearFunctionTestReplace. @@ -1340,7 +1276,8 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, /// used by ABI constrained operation, as opposed to inttoptr/ptrtoint). /// However, we don't yet have a strong motivation for converting loop tests /// into inequality tests. -static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { +static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE, + SCEVExpander &Rewriter) { const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) || BackedgeTakenCount->isZero()) @@ -1350,12 +1287,10 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { return false; // Can't rewrite non-branch yet. - BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); - if (!BI) + if (!isa<BranchInst>(L->getExitingBlock()->getTerminator())) return false; - SmallPtrSet<const SCEV*, 8> Processed; - if (isHighCostExpansion(BackedgeTakenCount, BI, Processed, SE)) + if (Rewriter.isHighCostExpansion(BackedgeTakenCount, L)) return false; return true; @@ -1521,9 +1456,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride. /// This is difficult in general for SCEV because of potential overflow. But we /// could at least handle constant BECounts. -static PHINode * -FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const DataLayout *DL) { +static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, + ScalarEvolution *SE, DominatorTree *DT) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = @@ -1552,7 +1486,8 @@ FindLoopCounter(Loop *L, const SCEV *BECount, // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. 
uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); - if (PhiWidth < BCWidth || (DL && !DL->isLegalInteger(PhiWidth))) + if (PhiWidth < BCWidth || + !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth)) continue; const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); @@ -1641,7 +1576,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, && "unit stride pointer IV must be i8*"); IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); - return Builder.CreateGEP(GEPBase, GEPOffset, "lftr.limit"); + return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit"); } else { // In any other case, convert both IVInit and IVCount to integers before @@ -1695,7 +1630,7 @@ LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter) { - assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); + assert(canExpandBackedgeTakenCount(L, SE, Rewriter) && "precondition"); // Initialize CmpIndVar and IVCount to their preincremented values. Value *CmpIndVar = IndVar; @@ -1705,51 +1640,15 @@ LinearFunctionTestReplace(Loop *L, // compare against the post-incremented value, otherwise we must compare // against the preincremented value. if (L->getExitingBlock() == L->getLoopLatch()) { + // Add one to the "backedge-taken" count to get the trip count. + // This addition may overflow, which is valid as long as the comparison is + // truncated to BackedgeTakenCount->getType(). + IVCount = SE->getAddExpr(BackedgeTakenCount, + SE->getConstant(BackedgeTakenCount->getType(), 1)); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. - llvm::Value *IncrementedIndvar = - IndVar->getIncomingValueForBlock(L->getExitingBlock()); - const auto *IncrementedIndvarSCEV = - cast<SCEVAddRecExpr>(SE->getSCEV(IncrementedIndvar)); - // It is unsafe to use the incremented indvar if it has a wrapping flag, we - // don't want to compare against a poison value. Check the SCEV that - // corresponds to the incremented indvar, the SCEVExpander will only insert - // flags in the IR if the SCEV originally had wrapping flags. - // FIXME: In theory, SCEV could drop flags even though they exist in IR. - // A more robust solution would involve getting a new expression for - // CmpIndVar by applying non-NSW/NUW AddExprs. - auto WrappingFlags = - ScalarEvolution::setFlags(SCEV::FlagNUW, SCEV::FlagNSW); - const SCEV *IVInit = IncrementedIndvarSCEV->getStart(); - if (SE->getTypeSizeInBits(IVInit->getType()) > - SE->getTypeSizeInBits(IVCount->getType())) - IVInit = SE->getTruncateExpr(IVInit, IVCount->getType()); - unsigned BitWidth = SE->getTypeSizeInBits(IVCount->getType()); - Type *WideTy = IntegerType::get(SE->getContext(), BitWidth + 1); - // Check if InitIV + BECount+1 requires sign/zero extension. - // If not, clear the corresponding flag from WrappingFlags because it is not - // necessary for those flags in the IncrementedIndvarSCEV expression. 
- if (SE->getSignExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount), - WideTy) == - SE->getAddExpr(SE->getSignExtendExpr(IVInit, WideTy), - SE->getSignExtendExpr(BackedgeTakenCount, WideTy))) - WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNSW); - if (SE->getZeroExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount), - WideTy) == - SE->getAddExpr(SE->getZeroExtendExpr(IVInit, WideTy), - SE->getZeroExtendExpr(BackedgeTakenCount, WideTy))) - WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNUW); - if (!ScalarEvolution::maskFlags(IncrementedIndvarSCEV->getNoWrapFlags(), - WrappingFlags)) { - // Add one to the "backedge-taken" count to get the trip count. - // This addition may overflow, which is valid as long as the comparison is - // truncated to BackedgeTakenCount->getType(). - IVCount = - SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); - CmpIndVar = IncrementedIndvar; - } + CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); } Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); @@ -1929,13 +1828,14 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); - TTI = getAnalysisIfAvailable<TargetTransformInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); + TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); DeadInsts.clear(); Changed = false; @@ -1947,7 +1847,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); // Create a rewriter object which we'll use to transform the code with. - SCEVExpander Rewriter(*SE, "indvars"); + SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif @@ -1975,8 +1875,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { - PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, DL); + if (canExpandBackedgeTakenCount(L, SE, Rewriter) && needsLFTR(L, DT)) { + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT); if (IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. 
Instead any
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 0000000..cbdacad
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1495 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does that in a way such that the loop running over
+// the middle range provably does not need range checks. As an example, it will
+// convert
+//
+//   len = < known positive >
+//   for (i = 0; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//
+// to
+//
+//   len = < known positive >
+//   limit = smin(n, len)
+//   // no first segment
+//   for (i = 0; i < limit; i++) {
+//     if (0 <= i && i < len) { // this check is fully redundant
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//   for (i = limit; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <array>
+
+using namespace llvm;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+                                        cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+                                       cl::init(false));
+
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+                                      cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+                                          cl::Hidden, cl::init(10));
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+///  1. a very cold successor (i.e. the branch jumps to that successor very
+///     rarely)
+///
+///  and
+///
+///  2. a condition that is provably true for some contiguous range of values
+///     taken by the containing loop's induction variable.
+///
+class InductiveRangeCheck {
+  // Classifies a range check
+  enum RangeCheckKind : unsigned {
+    // Range check of the form "0 <= I".
+    RANGE_CHECK_LOWER = 1,
+
+    // Range check of the form "I < L" where L is known positive.
+    RANGE_CHECK_UPPER = 2,
+
+    // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER
+    // conditions.
+    RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
+
+    // Unrecognized range check condition.
+    RANGE_CHECK_UNKNOWN = (unsigned)-1
+  };
+
+  static const char *rangeCheckKindToStr(RangeCheckKind);
+
+  const SCEV *Offset;
+  const SCEV *Scale;
+  Value *Length;
+  BranchInst *Branch;
+  RangeCheckKind Kind;
+
+  static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+                                            ScalarEvolution &SE, Value *&Index,
+                                            Value *&Length);
+
+  static InductiveRangeCheck::RangeCheckKind
+  parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition,
+                  const SCEV *&Index, Value *&UpperLimit);
+
+  InductiveRangeCheck() :
+    Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
+
+public:
+  const SCEV *getOffset() const { return Offset; }
+  const SCEV *getScale() const { return Scale; }
+  Value *getLength() const { return Length; }
+
+  void print(raw_ostream &OS) const {
+    OS << "InductiveRangeCheck:\n";
+    OS << "  Kind: " << rangeCheckKindToStr(Kind) << "\n";
+    OS << "  Offset: ";
+    Offset->print(OS);
+    OS << "  Scale: ";
+    Scale->print(OS);
+    OS << "  Length: ";
+    if (Length)
+      Length->print(OS);
+    else
+      OS << "(null)";
+    OS << "\n  Branch: ";
+    getBranch()->print(OS);
+    OS << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    print(dbgs());
+  }
+#endif
+
+  BranchInst *getBranch() const { return Branch; }
+
+  /// Represents a signed integer range [Range.getBegin(), Range.getEnd()).  If
+  /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+  class Range {
+    const SCEV *Begin;
+    const SCEV *End;
+
+  public:
+    Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+      assert(Begin->getType() == End->getType() && "ill-typed range!");
+    }
+
+    Type *getType() const { return Begin->getType(); }
+    const SCEV *getBegin() const { return Begin; }
+    const SCEV *getEnd() const { return End; }
+  };
+
+  typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
+
+  /// This is the value the condition of the branch needs to evaluate to for
+  /// the branch to take the hot successor (see (1) above).
+  bool getPassingDirection() { return true; }
+
+  /// Computes a range for the induction variable (IndVar) in which the range
+  /// check is redundant and can be constant-folded away.  The induction
+  /// variable is not required to be the canonical {0,+,1} induction variable.
+  Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+                                            const SCEVAddRecExpr *IndVar,
+                                            IRBuilder<> &B) const;
+
+  /// Create an inductive range check out of BI if possible, else return
+  /// nullptr.
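+  /// (Illustrative note, paraphrasing the definition below: `BI` must be a
+  /// conditional branch inside `L` other than the latch branch, and its first
+  /// successor must be likely taken according to `BPI`; any other branch
+  /// makes `create` return nullptr.)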
+  static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI,
+                                     Loop *L, ScalarEvolution &SE,
+                                     BranchProbabilityInfo &BPI);
+};
+
+class InductiveRangeCheckElimination : public LoopPass {
+  InductiveRangeCheck::AllocatorTy Allocator;
+
+public:
+  static char ID;
+  InductiveRangeCheckElimination() : LoopPass(ID) {
+    initializeInductiveRangeCheckEliminationPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<BranchProbabilityInfo>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+char InductiveRangeCheckElimination::ID = 0;
+}
+
+INITIALIZE_PASS(InductiveRangeCheckElimination, "irce",
+                "Inductive range check elimination", false, false)
+
+const char *InductiveRangeCheck::rangeCheckKindToStr(
+    InductiveRangeCheck::RangeCheckKind RCK) {
+  switch (RCK) {
+  case InductiveRangeCheck::RANGE_CHECK_UNKNOWN:
+    return "RANGE_CHECK_UNKNOWN";
+
+  case InductiveRangeCheck::RANGE_CHECK_UPPER:
+    return "RANGE_CHECK_UPPER";
+
+  case InductiveRangeCheck::RANGE_CHECK_LOWER:
+    return "RANGE_CHECK_LOWER";
+
+  case InductiveRangeCheck::RANGE_CHECK_BOTH:
+    return "RANGE_CHECK_BOTH";
+  }
+
+  llvm_unreachable("unknown range check type!");
+}
+
+/// Parse a single ICmp instruction, `ICI`, into a range check.  If `ICI`
+/// cannot be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and
+/// set `Index` and `Length` to `nullptr`.  Otherwise set `Index` to the value
+/// being range checked, and set `Length` to the upper limit `Index` is being
+/// range checked with if (and only if) the range check type is stronger or
+/// equal to RANGE_CHECK_UPPER.
+///
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+                                         ScalarEvolution &SE, Value *&Index,
+                                         Value *&Length) {
+
+  auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
+    const SCEV *S = SE.getSCEV(V);
+    if (isa<SCEVCouldNotCompute>(S))
+      return false;
+
+    return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
+           SE.isKnownNonNegative(S);
+  };
+
+  using namespace llvm::PatternMatch;
+
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *LHS = ICI->getOperand(0);
+  Value *RHS = ICI->getOperand(1);
+
+  switch (Pred) {
+  default:
+    return RANGE_CHECK_UNKNOWN;
+
+  case ICmpInst::ICMP_SLE:
+    std::swap(LHS, RHS);
+    // fallthrough
+  case ICmpInst::ICMP_SGE:
+    if (match(RHS, m_ConstantInt<0>())) {
+      Index = LHS;
+      return RANGE_CHECK_LOWER;
+    }
+    return RANGE_CHECK_UNKNOWN;
+
+  case ICmpInst::ICMP_SLT:
+    std::swap(LHS, RHS);
+    // fallthrough
+  case ICmpInst::ICMP_SGT:
+    if (match(RHS, m_ConstantInt<-1>())) {
+      Index = LHS;
+      return RANGE_CHECK_LOWER;
+    }
+
+    if (IsNonNegativeAndNotLoopVarying(LHS)) {
+      Index = RHS;
+      Length = LHS;
+      return RANGE_CHECK_UPPER;
+    }
+    return RANGE_CHECK_UNKNOWN;
+
+  case ICmpInst::ICMP_ULT:
+    std::swap(LHS, RHS);
+    // fallthrough
+  case ICmpInst::ICMP_UGT:
+    if (IsNonNegativeAndNotLoopVarying(LHS)) {
+      Index = RHS;
+      Length = LHS;
+      return RANGE_CHECK_BOTH;
+    }
+    return RANGE_CHECK_UNKNOWN;
+  }
+
+  llvm_unreachable("default clause returns!");
+}
+
+/// Parses an arbitrary condition into a range check.  `Length` is set only if
+/// the range check is recognized to be `RANGE_CHECK_UPPER` or stronger.
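+///
+/// For example (an illustrative case, assuming `%i` is the loop's induction
+/// variable and `%len` is loop invariant and known non-negative; this is not
+/// the only shape accepted):
+///
+///   %lower = icmp sge i32 %i, 0
+///   %upper = icmp slt i32 %i, %len
+///   %cond  = and i1 %lower, %upper
+///
+/// parses to RANGE_CHECK_BOTH, with `Index` set to the SCEV of `%i` and
+/// `Length` set to `%len`.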
+InductiveRangeCheck::RangeCheckKind +InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE, + Value *Condition, const SCEV *&Index, + Value *&Length) { + using namespace llvm::PatternMatch; + + Value *A = nullptr; + Value *B = nullptr; + + if (match(Condition, m_And(m_Value(A), m_Value(B)))) { + Value *IndexA = nullptr, *IndexB = nullptr; + Value *LengthA = nullptr, *LengthB = nullptr; + ICmpInst *ICmpA = dyn_cast<ICmpInst>(A), *ICmpB = dyn_cast<ICmpInst>(B); + + if (!ICmpA || !ICmpB) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + auto RCKindA = parseRangeCheckICmp(L, ICmpA, SE, IndexA, LengthA); + auto RCKindB = parseRangeCheckICmp(L, ICmpB, SE, IndexB, LengthB); + + if (RCKindA == InductiveRangeCheck::RANGE_CHECK_UNKNOWN || + RCKindB == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + if (IndexA != IndexB) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + if (LengthA != nullptr && LengthB != nullptr && LengthA != LengthB) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + Index = SE.getSCEV(IndexA); + if (isa<SCEVCouldNotCompute>(Index)) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + Length = LengthA == nullptr ? LengthB : LengthA; + + return (InductiveRangeCheck::RangeCheckKind)(RCKindA | RCKindB); + } + + if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) { + Value *IndexVal = nullptr; + + auto RCKind = parseRangeCheckICmp(L, ICI, SE, IndexVal, Length); + + if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + Index = SE.getSCEV(IndexVal); + if (isa<SCEVCouldNotCompute>(Index)) + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + + return RCKind; + } + + return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; +} + + +InductiveRangeCheck * +InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI, + Loop *L, ScalarEvolution &SE, + BranchProbabilityInfo &BPI) { + + if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) + return nullptr; + + BranchProbability LikelyTaken(15, 16); + + if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken) + return nullptr; + + Value *Length = nullptr; + const SCEV *IndexSCEV = nullptr; + + auto RCKind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(), + IndexSCEV, Length); + + if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) + return nullptr; + + assert(IndexSCEV && "contract with SplitRangeCheckCondition!"); + assert((!(RCKind & InductiveRangeCheck::RANGE_CHECK_UPPER) || Length) && + "contract with SplitRangeCheckCondition!"); + + const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV); + bool IsAffineIndex = + IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine(); + + if (!IsAffineIndex) + return nullptr; + + InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck; + IRC->Length = Length; + IRC->Offset = IndexAddRec->getStart(); + IRC->Scale = IndexAddRec->getStepRecurrence(SE); + IRC->Branch = BI; + IRC->Kind = RCKind; + return IRC; +} + +namespace { + +// Keeps track of the structure of a loop. This is similar to llvm::Loop, +// except that it is more lightweight and can track the state of a loop through +// changing and potentially invalid IR. This structure also formalizes the +// kinds of loops we can deal with -- ones that have a single latch that is also +// an exiting block *and* have a canonical induction variable. 
+struct LoopStructure {
+  const char *Tag;
+
+  BasicBlock *Header;
+  BasicBlock *Latch;
+
+  // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
+  // successor is `LatchExit', the exit block of the loop.
+  BranchInst *LatchBr;
+  BasicBlock *LatchExit;
+  unsigned LatchBrExitIdx;
+
+  Value *IndVarNext;
+  Value *IndVarStart;
+  Value *LoopExitAt;
+  bool IndVarIncreasing;
+
+  LoopStructure()
+      : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
+        LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
+        IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+
+  template <typename M> LoopStructure map(M Map) const {
+    LoopStructure Result;
+    Result.Tag = Tag;
+    Result.Header = cast<BasicBlock>(Map(Header));
+    Result.Latch = cast<BasicBlock>(Map(Latch));
+    Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+    Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+    Result.LatchBrExitIdx = LatchBrExitIdx;
+    Result.IndVarNext = Map(IndVarNext);
+    Result.IndVarStart = Map(IndVarStart);
+    Result.LoopExitAt = Map(LoopExitAt);
+    Result.IndVarIncreasing = IndVarIncreasing;
+    return Result;
+  }
+
+  static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+                                                    BranchProbabilityInfo &BPI,
+                                                    Loop &,
+                                                    const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration
+/// space.  The algorithm this class implements is given a Loop and a range
+/// [Begin, End).  The algorithm then tries to break out a "main loop" out of
+/// the loop it is given in a way that the "main loop" runs with the induction
+/// variable in a subset of [Begin, End).  The algorithm emits appropriate
+/// pre and post loops to run any remaining iterations.  The pre loop runs any
+/// iterations in which the induction variable is < Begin, and the post loop
+/// runs any iterations in which the induction variable is >= End.
+///
+class LoopConstrainer {
+  // The representation of a clone of the original loop we started out with.
+  struct ClonedLoop {
+    // The cloned blocks
+    std::vector<BasicBlock *> Blocks;
+
+    // `Map` maps values in the clonee into values in the cloned version
+    ValueToValueMapTy Map;
+
+    // An instance of `LoopStructure` for the cloned loop
+    LoopStructure Structure;
+  };
+
+  // Result of rewriting the range of a loop.  See changeIterationSpaceEnd for
+  // more details on what these fields mean.
+  struct RewrittenRangeInfo {
+    BasicBlock *PseudoExit;
+    BasicBlock *ExitSelector;
+    std::vector<PHINode *> PHIValuesAtPseudoExit;
+    PHINode *IndVarEnd;
+
+    RewrittenRangeInfo()
+        : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+  };
+
+  // Calculated subranges we restrict the iteration space of the main loop to.
+  // See the implementation of `calculateSubRanges' for more details on how
+  // these fields are computed.  `LowLimit` is None if there is no restriction
+  // on the low end of the restricted iteration space of the main loop.
+  // `HighLimit` is None if there is no restriction on the high end of the
+  // restricted iteration space of the main loop.
+
+  struct SubRanges {
+    Optional<const SCEV *> LowLimit;
+    Optional<const SCEV *> HighLimit;
+  };
+
+  // A utility function that does a `replaceUsesOfWith' on the incoming block
+  // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+  // incoming block list with `ReplaceBy'.
+  static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+                              BasicBlock *ReplaceBy);
+
+  // Compute a safe set of limits for the main loop to run in -- effectively
+  // the intersection of `Range' and the iteration space of the original loop.
+  // Return None if unable to compute the set of subranges.
+  //
+  Optional<SubRanges> calculateSubRanges() const;
+
+  // Clone `OriginalLoop' and return the result in CLResult.  The IR after
+  // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+  // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+  // but there is no such edge.
+  //
+  void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+  // Rewrite the iteration space of the loop denoted by (LS, Preheader).  The
+  // iteration space of the rewritten loop ends at ExitLoopAt.  The start of
+  // the iteration space is not changed.  `ExitLoopAt' is assumed to be slt
+  // `OriginalHeaderCount'.
+  //
+  // If there are iterations left to execute, control is made to jump to
+  // `ContinuationBlock', otherwise they take the normal loop exit.  The
+  // returned `RewrittenRangeInfo' object is populated as follows:
+  //
+  //  .PseudoExit is a basic block that unconditionally branches to
+  //  `ContinuationBlock'.
+  //
+  //  .ExitSelector is a basic block that decides, on exit from the loop,
+  //  whether to branch to the "true" exit or to `PseudoExit'.
+  //
+  //  .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the
+  //  value for each PHINode in the loop header on taking the pseudo exit.
+  //
+  // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+  // preheader because it is made to branch to the loop header only
+  // conditionally.
+  //
+  RewrittenRangeInfo
+  changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+                          Value *ExitLoopAt,
+                          BasicBlock *ContinuationBlock) const;
+
+  // The loop denoted by `LS' has `OldPreheader' as its preheader.  This
+  // function creates a new preheader for `LS' and returns it.
+  //
+  BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+                              const char *Tag) const;
+
+  // `ContinuationBlockAndPreheader' was the continuation block for some call
+  // to `changeIterationSpaceEnd' and is the preheader to the loop denoted by
+  // `LS'.  This function rewrites the PHI nodes in `LS.Header' to start with
+  // the correct value.
+  void rewriteIncomingValuesForPHIs(
+      LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+      const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+  // Even though we do not preserve any passes at this time, we at least need
+  // to keep the parent loop structure consistent.  The `LPPassManager' seems
+  // to verify this after running a loop pass.  This function adds the list of
+  // blocks denoted by BBs to this loop's parent loop if required.
+  void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+  // Some global state.
+  Function &F;
+  LLVMContext &Ctx;
+  ScalarEvolution &SE;
+
+  // Information about the original loop we started out with.
+  Loop &OriginalLoop;
+  LoopInfo &OriginalLoopInfo;
+  const SCEV *LatchTakenCount;
+  BasicBlock *OriginalPreheader;
+
+  // The preheader of the main loop.  This may or may not be different from
+  // `OriginalPreheader'.
+  BasicBlock *MainLoopPreheader;
+
+  // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range; + + // The structure of the main loop (see comment at the beginning of this class + // for a definition) + LoopStructure MainLoopStructure; + +public: + LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS, + ScalarEvolution &SE, InductiveRangeCheck::Range R) + : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()), + SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr), + OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R), + MainLoopStructure(LS) {} + + // Entry point for the algorithm. Returns true on success. + bool run(); +}; + +} + +void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, + BasicBlock *ReplaceBy) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingBlock(i) == Block) + PN->setIncomingBlock(i, ReplaceBy); +} + +static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) { + APInt SMax = + APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(SMax) && + SE.getUnsignedRange(S).contains(SMax); +} + +static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) { + APInt SMin = + APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(SMin) && + SE.getUnsignedRange(S).contains(SMin); +} + +Optional<LoopStructure> +LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI, + Loop &L, const char *&FailureReason) { + assert(L.isLoopSimplifyForm() && "should follow from addRequired<>"); + + BasicBlock *Latch = L.getLoopLatch(); + if (!L.isLoopExiting(Latch)) { + FailureReason = "no loop latch"; + return None; + } + + BasicBlock *Header = L.getHeader(); + BasicBlock *Preheader = L.getLoopPreheader(); + if (!Preheader) { + FailureReason = "no preheader"; + return None; + } + + BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin()); + if (!LatchBr || LatchBr->isUnconditional()) { + FailureReason = "latch terminator not conditional branch"; + return None; + } + + unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0; + + BranchProbability ExitProbability = + BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx); + + if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) { + FailureReason = "short running loop, not profitable"; + return None; + } + + ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition()); + if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) { + FailureReason = "latch terminator branch not conditional on integral icmp"; + return None; + } + + const SCEV *LatchCount = SE.getExitCount(&L, Latch); + if (isa<SCEVCouldNotCompute>(LatchCount)) { + FailureReason = "could not compute latch count"; + return None; + } + + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *LeftValue = ICI->getOperand(0); + const SCEV *LeftSCEV = SE.getSCEV(LeftValue); + IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType()); + + Value *RightValue = ICI->getOperand(1); + const SCEV *RightSCEV = SE.getSCEV(RightValue); + + // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence. 
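+  // For instance, given the common latch condition `icmp slt %i.next, %n`
+  // (an illustrative example; `%i.next` stands for the already-incremented
+  // induction variable), `LeftSCEV` is already an add recurrence such as
+  // {1,+,1} and no swap is needed; for `icmp sgt %n, %i.next` the operands
+  // and the predicate are swapped below.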
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) { + if (isa<SCEVAddRecExpr>(RightSCEV)) { + std::swap(LeftSCEV, RightSCEV); + std::swap(LeftValue, RightValue); + Pred = ICmpInst::getSwappedPredicate(Pred); + } else { + FailureReason = "no add recurrences in the icmp"; + return None; + } + } + + auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) { + if (AR->getNoWrapFlags(SCEV::FlagNSW)) + return true; + + IntegerType *Ty = cast<IntegerType>(AR->getType()); + IntegerType *WideTy = + IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2); + + const SCEVAddRecExpr *ExtendAfterOp = + dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); + if (ExtendAfterOp) { + const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy); + const SCEV *ExtendedStep = + SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy); + + bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart && + ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep; + + if (NoSignedWrap) + return true; + } + + // We may have proved this when computing the sign extension above. + return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap; + }; + + auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) { + if (!AR->isAffine()) + return false; + + // Currently we only work with induction variables that have been proved to + // not wrap. This restriction can potentially be lifted in the future. + + if (!HasNoSignedWrap(AR)) + return false; + + if (const SCEVConstant *StepExpr = + dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { + ConstantInt *StepCI = StepExpr->getValue(); + if (StepCI->isOne() || StepCI->isMinusOne()) { + IsIncreasing = StepCI->isOne(); + return true; + } + } + + return false; + }; + + // `ICI` is interpreted as taking the backedge if the *next* value of the + // induction variable satisfies some constraint. + + const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV); + bool IsIncreasing = false; + if (!IsInductionVar(IndVarNext, IsIncreasing)) { + FailureReason = "LHS in icmp not induction variable"; + return None; + } + + ConstantInt *One = ConstantInt::get(IndVarTy, 1); + // TODO: generalize the predicates here to also match their unsigned variants. + if (IsIncreasing) { + bool FoundExpectedPred = + (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || + (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); + + if (!FoundExpectedPred) { + FailureReason = "expected icmp slt semantically, found something else"; + return None; + } + + if (LatchBrExitIdx == 0) { + if (CanBeSMax(SE, RightSCEV)) { + // TODO: this restriction is easily removable -- we just have to + // remember that the icmp was an slt and not an sle. + FailureReason = "limit may overflow when coercing sle to slt"; + return None; + } + + IRBuilder<> B(&*Preheader->rbegin()); + RightValue = B.CreateAdd(RightValue, One); + } + + } else { + bool FoundExpectedPred = + (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || + (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); + + if (!FoundExpectedPred) { + FailureReason = "expected icmp sgt semantically, found something else"; + return None; + } + + if (LatchBrExitIdx == 0) { + if (CanBeSMin(SE, RightSCEV)) { + // TODO: this restriction is easily removable -- we just have to + // remember that the icmp was an sgt and not an sge. 
+        FailureReason = "limit may overflow when coercing sge to sgt";
+        return None;
+      }
+
+      IRBuilder<> B(&*Preheader->rbegin());
+      RightValue = B.CreateSub(RightValue, One);
+    }
+  }
+
+  const SCEV *StartNext = IndVarNext->getStart();
+  const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+  const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
+  BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+  assert(SE.getLoopDisposition(LatchCount, &L) ==
+             ScalarEvolution::LoopInvariant &&
+         "loop variant exit count doesn't make sense!");
+
+  assert(!L.contains(LatchExit) && "expected an exit block!");
+  const DataLayout &DL = Preheader->getModule()->getDataLayout();
+  Value *IndVarStartV =
+      SCEVExpander(SE, DL, "irce")
+          .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin());
+  IndVarStartV->setName("indvar.start");
+
+  LoopStructure Result;
+
+  Result.Tag = "main";
+  Result.Header = Header;
+  Result.Latch = Latch;
+  Result.LatchBr = LatchBr;
+  Result.LatchExit = LatchExit;
+  Result.LatchBrExitIdx = LatchBrExitIdx;
+  Result.IndVarStart = IndVarStartV;
+  Result.IndVarNext = LeftValue;
+  Result.IndVarIncreasing = IsIncreasing;
+  Result.LoopExitAt = RightValue;
+
+  FailureReason = nullptr;
+
+  return Result;
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges() const {
+  IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+  if (Range.getType() != Ty)
+    return None;
+
+  LoopConstrainer::SubRanges Result;
+
+  // I think we can be more aggressive here and make this nuw / nsw if the
+  // addition that feeds into the icmp for the latch's terminating branch is
+  // nuw / nsw.  In any case, a wrapping 2's complement addition is safe.
+  ConstantInt *One = ConstantInt::get(Ty, 1);
+  const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+  const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+  bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
+  // range of values the induction variable takes.
+
+  const SCEV *Smallest = nullptr, *Greatest = nullptr;
+
+  if (Increasing) {
+    Smallest = Start;
+    Greatest = End;
+  } else {
+    // These two computations may sign-overflow.  Here is why that is okay:
+    //
+    // We know that the induction variable does not sign-overflow on any
+    // iteration except the last one, and it starts at `Start` and ends at
+    // `End`, decrementing by one every time.
+    //
+    //  * if `Smallest` sign-overflows we know `End` is `INT_SMAX`.  Since the
+    //    induction variable is decreasing, we know that the smallest value
+    //    the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+    //
+    //  * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`.  In
+    //    that case, `Clamp` will always return `Smallest` and
+    //    [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+    //    will be an empty range.  Returning an empty range is always safe.
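+    //
+    //    (Worked example, for illustration only: in i8, a loop counting down
+    //    from Start = 5 to End = -3 executes the body with the values
+    //    5, 4, ..., -2, i.e. with [End + 1, Start + 1) = [-2, 6), which is
+    //    exactly what the two additions below compute.)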
+ // + + Smallest = SE.getAddExpr(End, SE.getSCEV(One)); + Greatest = SE.getAddExpr(Start, SE.getSCEV(One)); + } + + auto Clamp = [this, Smallest, Greatest](const SCEV *S) { + return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)); + }; + + // In some cases we can prove that we don't need a pre or post loop + + bool ProvablyNoPreloop = + SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest); + if (!ProvablyNoPreloop) + Result.LowLimit = Clamp(Range.getBegin()); + + bool ProvablyNoPostLoop = + SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd()); + if (!ProvablyNoPostLoop) + Result.HighLimit = Clamp(Range.getEnd()); + + return Result; +} + +void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, + const char *Tag) const { + for (BasicBlock *BB : OriginalLoop.getBlocks()) { + BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F); + Result.Blocks.push_back(Clone); + Result.Map[BB] = Clone; + } + + auto GetClonedValue = [&Result](Value *V) { + assert(V && "null values not in domain!"); + auto It = Result.Map.find(V); + if (It == Result.Map.end()) + return V; + return static_cast<Value *>(It->second); + }; + + Result.Structure = MainLoopStructure.map(GetClonedValue); + Result.Structure.Tag = Tag; + + for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) { + BasicBlock *ClonedBB = Result.Blocks[i]; + BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i]; + + assert(Result.Map[OriginalBB] == ClonedBB && "invariant!"); + + for (Instruction &I : *ClonedBB) + RemapInstruction(&I, Result.Map, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + + // Exit blocks will now have one more predecessor and their PHI nodes need + // to be edited to reflect that. No phi nodes need to be introduced because + // the loop is in LCSSA. + + for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB); + SBBI != SBBE; ++SBBI) { + + if (OriginalLoop.contains(*SBBI)) + continue; // not an exit block + + for (Instruction &I : **SBBI) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB); + PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB); + } + } + } +} + +LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( + const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt, + BasicBlock *ContinuationBlock) const { + + // We start with a loop with a single latch: + // + // +--------------------+ + // | | + // | preheader | + // | | + // +--------+-----------+ + // | ----------------\ + // | / | + // +--------v----v------+ | + // | | | + // | header | | + // | | | + // +--------------------+ | + // | + // ..... | + // | + // +--------------------+ | + // | | | + // | latch >----------/ + // | | + // +-------v------------+ + // | + // | + // | +--------------------+ + // | | | + // +---> original exit | + // | | + // +--------------------+ + // + // We change the control flow to look like + // + // + // +--------------------+ + // | | + // | preheader >-------------------------+ + // | | | + // +--------v-----------+ | + // | /-------------+ | + // | / | | + // +--------v--v--------+ | | + // | | | | + // | header | | +--------+ | + // | | | | | | + // +--------------------+ | | +-----v-----v-----------+ + // | | | | + // | | | .pseudo.exit | + // | | | | + // | | +-----------v-----------+ + // | | | + // ..... 
| | | + // | | +--------v-------------+ + // +--------------------+ | | | | + // | | | | | ContinuationBlock | + // | latch >------+ | | | + // | | | +----------------------+ + // +---------v----------+ | + // | | + // | | + // | +---------------^-----+ + // | | | + // +-----> .exit.selector | + // | | + // +----------v----------+ + // | + // +--------------------+ | + // | | | + // | original exit <----+ + // | | + // +--------------------+ + // + + RewrittenRangeInfo RRI; + + auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); + RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", + &F, BBInsertLocation); + RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, + BBInsertLocation); + + BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); + bool Increasing = LS.IndVarIncreasing; + + IRBuilder<> B(PreheaderJump); + + // EnterLoopCond - is it okay to start executing this `LS'? + Value *EnterLoopCond = Increasing + ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt); + + B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit); + PreheaderJump->eraseFromParent(); + + LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector); + B.SetInsertPoint(LS.LatchBr); + Value *TakeBackedgeLoopCond = + Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt) + : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt); + Value *CondForBranch = LS.LatchBrExitIdx == 1 + ? TakeBackedgeLoopCond + : B.CreateNot(TakeBackedgeLoopCond); + + LS.LatchBr->setCondition(CondForBranch); + + B.SetInsertPoint(RRI.ExitSelector); + + // IterationsLeft - are there any more iterations left, given the original + // upper bound on the induction variable? If not, we branch to the "real" + // exit. + Value *IterationsLeft = Increasing + ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt) + : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt); + B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit); + + BranchInst *BranchToContinuation = + BranchInst::Create(ContinuationBlock, RRI.PseudoExit); + + // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of + // each of the PHI nodes in the loop header. This feeds into the initial + // value of the same PHI nodes if/when we continue execution. + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + + PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy", + BranchToContinuation); + + NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader); + NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch), + RRI.ExitSelector); + RRI.PHIValuesAtPseudoExit.push_back(NewPHI); + } + + RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end", + BranchToContinuation); + RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader); + RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector); + + // The latch exit now has a branch from `RRI.ExitSelector' instead of + // `LS.Latch'. The PHI nodes need to be updated to reflect that. 
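+  // (Illustrative example: an LCSSA PHI such as `%v.lcssa = phi [ %v, %latch ]`
+  // in the latch exit becomes `phi [ %v, %exit.selector ]`; the names here
+  // are made up for exposition.)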
+ for (Instruction &I : *LS.LatchExit) { + if (PHINode *PN = dyn_cast<PHINode>(&I)) + replacePHIBlock(PN, LS.Latch, RRI.ExitSelector); + else + break; + } + + return RRI; +} + +void LoopConstrainer::rewriteIncomingValuesForPHIs( + LoopStructure &LS, BasicBlock *ContinuationBlock, + const LoopConstrainer::RewrittenRangeInfo &RRI) const { + + unsigned PHIIndex = 0; + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + if (PN->getIncomingBlock(i) == ContinuationBlock) + PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]); + } + + LS.IndVarStart = RRI.IndVarEnd; +} + +BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS, + BasicBlock *OldPreheader, + const char *Tag) const { + + BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header); + BranchInst::Create(LS.Header, Preheader); + + for (Instruction &I : *LS.Header) { + if (!isa<PHINode>(&I)) + break; + + PHINode *PN = cast<PHINode>(&I); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) + replacePHIBlock(PN, OldPreheader, Preheader); + } + + return Preheader; +} + +void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) { + Loop *ParentLoop = OriginalLoop.getParentLoop(); + if (!ParentLoop) + return; + + for (BasicBlock *BB : BBs) + ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo); +} + +bool LoopConstrainer::run() { + BasicBlock *Preheader = nullptr; + LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch); + Preheader = OriginalLoop.getLoopPreheader(); + assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr && + "preconditions!"); + + OriginalPreheader = Preheader; + MainLoopPreheader = Preheader; + + Optional<SubRanges> MaybeSR = calculateSubRanges(); + if (!MaybeSR.hasValue()) { + DEBUG(dbgs() << "irce: could not compute subranges\n"); + return false; + } + + SubRanges SR = MaybeSR.getValue(); + bool Increasing = MainLoopStructure.IndVarIncreasing; + IntegerType *IVTy = + cast<IntegerType>(MainLoopStructure.IndVarNext->getType()); + + SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce"); + Instruction *InsertPt = OriginalPreheader->getTerminator(); + + // It would have been better to make `PreLoop' and `PostLoop' + // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy + // constructor. + ClonedLoop PreLoop, PostLoop; + bool NeedsPreLoop = + Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue(); + bool NeedsPostLoop = + Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue(); + + Value *ExitPreLoopAt = nullptr; + Value *ExitMainLoopAt = nullptr; + const SCEVConstant *MinusOneS = + cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */)); + + if (NeedsPreLoop) { + const SCEV *ExitPreLoopAtSCEV = nullptr; + + if (Increasing) + ExitPreLoopAtSCEV = *SR.LowLimit; + else { + if (CanBeSMin(SE, *SR.HighLimit)) { + DEBUG(dbgs() << "irce: could not prove no-overflow when computing " + << "preloop exit limit. 
HighLimit = " << *(*SR.HighLimit) + << "\n"); + return false; + } + ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS); + } + + ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt); + ExitPreLoopAt->setName("exit.preloop.at"); + } + + if (NeedsPostLoop) { + const SCEV *ExitMainLoopAtSCEV = nullptr; + + if (Increasing) + ExitMainLoopAtSCEV = *SR.HighLimit; + else { + if (CanBeSMin(SE, *SR.LowLimit)) { + DEBUG(dbgs() << "irce: could not prove no-overflow when computing " + << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit) + << "\n"); + return false; + } + ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS); + } + + ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt); + ExitMainLoopAt->setName("exit.mainloop.at"); + } + + // We clone these ahead of time so that we don't have to deal with changing + // and temporarily invalid IR as we transform the loops. + if (NeedsPreLoop) + cloneLoop(PreLoop, "preloop"); + if (NeedsPostLoop) + cloneLoop(PostLoop, "postloop"); + + RewrittenRangeInfo PreLoopRRI; + + if (NeedsPreLoop) { + Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header, + PreLoop.Structure.Header); + + MainLoopPreheader = + createPreheader(MainLoopStructure, Preheader, "mainloop"); + PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader, + ExitPreLoopAt, MainLoopPreheader); + rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader, + PreLoopRRI); + } + + BasicBlock *PostLoopPreheader = nullptr; + RewrittenRangeInfo PostLoopRRI; + + if (NeedsPostLoop) { + PostLoopPreheader = + createPreheader(PostLoop.Structure, Preheader, "postloop"); + PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader, + ExitMainLoopAt, PostLoopPreheader); + rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader, + PostLoopRRI); + } + + BasicBlock *NewMainLoopPreheader = + MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr; + BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit, + PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit, + PostLoopRRI.ExitSelector, NewMainLoopPreheader}; + + // Some of the above may be nullptr, filter them out before passing to + // addToParentLoopIfNeeded. + auto NewBlocksEnd = + std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr); + + addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd)); + addToParentLoopIfNeeded(PreLoop.Blocks); + addToParentLoopIfNeeded(PostLoop.Blocks); + + return true; +} + +/// Computes and returns a range of values for the induction variable (IndVar) +/// in which the range check can be safely elided. If it cannot compute such a +/// range, returns None. +Optional<InductiveRangeCheck::Range> +InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE, + const SCEVAddRecExpr *IndVar, + IRBuilder<> &) const { + // IndVar is of the form "A + B * I" (where "I" is the canonical induction + // variable, that may or may not exist as a real llvm::Value in the loop) and + // this inductive range check is a range check on the "C + D * I" ("C" is + // getOffset() and "D" is getScale()). We rewrite the value being range + // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA". + // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code + // can be generalized as needed. + // + // The actual inequalities we solve are of the form + // + // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. 
N == 1) + // + // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions + // and subtractions are twos-complement wrapping and comparisons are signed. + // + // Proof: + // + // If there exists IndVar such that -M <= IndVar < (L - M) then it follows + // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows + // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have + // overflown. + // + // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t. + // Hence 0 <= (IndVar + M) < L + + // [^1]: Note that the solution does _not_ apply if L < 0; consider values M = + // 127, IndVar = 126 and L = -2 in an i8 world. + + if (!IndVar->isAffine()) + return None; + + const SCEV *A = IndVar->getStart(); + const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE)); + if (!B) + return None; + + const SCEV *C = getOffset(); + const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale()); + if (D != B) + return None; + + ConstantInt *ConstD = D->getValue(); + if (!(ConstD->isMinusOne() || ConstD->isOne())) + return None; + + const SCEV *M = SE.getMinusSCEV(C, A); + + const SCEV *Begin = SE.getNegativeSCEV(M); + const SCEV *UpperLimit = nullptr; + + // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L". + // We can potentially do much better here. + if (Value *V = getLength()) { + UpperLimit = SE.getSCEV(V); + } else { + assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!"); + unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth(); + UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + } + + const SCEV *End = SE.getMinusSCEV(UpperLimit, M); + return InductiveRangeCheck::Range(Begin, End); +} + +static Optional<InductiveRangeCheck::Range> +IntersectRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2, IRBuilder<> &B) { + if (!R1.hasValue()) + return R2; + auto &R1Value = R1.getValue(); + + // TODO: we could widen the smaller range and have this work; but for now we + // bail out to keep things simple. 
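+  // (Note, added for clarity: the result computed below is the usual interval
+  // meet -- the intersection of two half-open signed ranges [B1, E1) and
+  // [B2, E2) is [smax(B1, B2), smin(E1, E2)) -- possibly empty, which Range
+  // explicitly permits.)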
+ if (R1Value.getType() != R2.getType()) + return None; + + const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin()); + const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd()); + + return InductiveRangeCheck::Range(NewBegin, NewEnd); +} + +bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { + if (L->getBlocks().size() >= LoopSizeCutoff) { + DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << "irce: loop has no preheader, leaving\n"); + return false; + } + + LLVMContext &Context = Preheader->getContext(); + InductiveRangeCheck::AllocatorTy IRCAlloc; + SmallVector<InductiveRangeCheck *, 16> RangeChecks; + ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + + for (auto BBI : L->getBlocks()) + if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) + if (InductiveRangeCheck *IRC = + InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI)) + RangeChecks.push_back(IRC); + + if (RangeChecks.empty()) + return false; + + auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) { + OS << "irce: looking at loop "; L->print(OS); + OS << "irce: loop has " << RangeChecks.size() + << " inductive range checks: \n"; + for (InductiveRangeCheck *IRC : RangeChecks) + IRC->print(OS); + }; + + DEBUG(PrintRecognizedRangeChecks(dbgs())); + + if (PrintRangeChecks) + PrintRecognizedRangeChecks(errs()); + + const char *FailureReason = nullptr; + Optional<LoopStructure> MaybeLoopStructure = + LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason); + if (!MaybeLoopStructure.hasValue()) { + DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason + << "\n";); + return false; + } + LoopStructure LS = MaybeLoopStructure.getValue(); + bool Increasing = LS.IndVarIncreasing; + const SCEV *MinusOne = + SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true); + const SCEVAddRecExpr *IndVar = + cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne)); + + Optional<InductiveRangeCheck::Range> SafeIterRange; + Instruction *ExprInsertPt = Preheader->getTerminator(); + + SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate; + + IRBuilder<> B(ExprInsertPt); + for (InductiveRangeCheck *IRC : RangeChecks) { + auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B); + if (Result.hasValue()) { + auto MaybeSafeIterRange = + IntersectRange(SE, SafeIterRange, Result.getValue(), B); + if (MaybeSafeIterRange.hasValue()) { + RangeChecksToEliminate.push_back(IRC); + SafeIterRange = MaybeSafeIterRange.getValue(); + } + } + } + + if (!SafeIterRange.hasValue()) + return false; + + LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS, + SE, SafeIterRange.getValue()); + bool Changed = LC.run(); + + if (Changed) { + auto PrintConstrainedLoopInfo = [L]() { + dbgs() << "irce: in function "; + dbgs() << L->getHeader()->getParent()->getName() << ": "; + dbgs() << "constrained "; + L->print(dbgs()); + }; + + DEBUG(PrintConstrainedLoopInfo()); + + if (PrintChangedLoops) + PrintConstrainedLoopInfo(); + + // Optimize away the now-redundant range checks. + + for (InductiveRangeCheck *IRC : RangeChecksToEliminate) { + ConstantInt *FoldedRangeCheck = IRC->getPassingDirection() + ? 
ConstantInt::getTrue(Context) + : ConstantInt::getFalse(Context); + IRC->getBranch()->setCondition(FoldedRangeCheck); + } + } + + return Changed; +} + +Pass *llvm::createInductiveRangeCheckEliminationPass() { + return new InductiveRangeCheckElimination; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 78beb3f..711df41 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -32,7 +33,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -78,7 +78,6 @@ namespace { /// revectored to the false side of the second if. /// class JumpThreading : public FunctionPass { - const DataLayout *DL; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -115,7 +114,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } void FindLoopHeaders(Function &F); @@ -145,7 +144,7 @@ char JumpThreading::ID = 0; INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) @@ -159,9 +158,7 @@ bool JumpThreading::runOnFunction(Function &F) { return false; DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); // Remove unreachable blocks from function as they may result in infinite @@ -505,6 +502,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, assert(Preference == WantInteger && "Compares only produce integers"); PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); if (PN && PN->getParent() == BB) { + const DataLayout &DL = PN->getModule()->getDataLayout(); // We can do this simplification if any comparisons fold to true or false. // See if any do. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { @@ -709,7 +707,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // Run constant folding to see if we can reduce the condition to a simple // constant. 
if (Instruction *I = dyn_cast<Instruction>(Condition)) { - Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI); + Value *SimpleVal = + ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); I->eraseFromParent(); @@ -792,6 +791,17 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true); BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); CondBr->eraseFromParent(); + if (CondCmp->use_empty()) + CondCmp->eraseFromParent(); + else if (CondCmp->getParent() == BB) { + // If the fact we just learned is true for all uses of the + // condition, replace it with a constant value + auto *CI = Baseline == LazyValueInfo::True ? + ConstantInt::getTrue(CondCmp->getType()) : + ConstantInt::getFalse(CondCmp->getType()); + CondCmp->replaceAllUsesWith(CI); + CondCmp->eraseFromParent(); + } return true; } } @@ -993,7 +1003,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Split them out to their own block. UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this); + SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -1418,7 +1428,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1521,7 +1531,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. - SimplifyInstructionsInBlock(NewBB, DL, TLI); + SimplifyInstructionsInBlock(NewBB, TLI); // Threaded an edge! ++NumThreads; @@ -1561,7 +1571,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! Clone all the instructions in BB onto the end @@ -1575,7 +1585,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); if (!OldPredBranch || !OldPredBranch->isUnconditional()) { - PredBB = SplitEdge(PredBB, BB, this); + PredBB = SplitEdge(PredBB, BB); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } @@ -1586,7 +1596,6 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); - // Clone the non-phi instructions of BB into PredBB, keeping track of the // mapping and using it to remap operands in the cloned instructions. for (; BI != BB->end(); ++BI) { @@ -1603,7 +1612,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. 
- if (Value *IV = SimplifyInstruction(New, DL)) { + if (Value *IV = + SimplifyInstruction(New, BB->getModule()->getDataLayout())) { delete New; ValueMapping[BI] = IV; } else { diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index e145981..f0e6d64 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -38,6 +38,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -52,7 +53,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -71,6 +71,33 @@ static cl::opt<bool> DisablePromotion("disable-licm-promotion", cl::Hidden, cl::desc("Disable memory promotion in LICM pass")); +static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); +static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop); +static bool hoist(Instruction &I, BasicBlock *Preheader); +static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, + const Loop *CurLoop, AliasSetTracker *CurAST ); +static bool isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo); +static bool isSafeToExecuteUnconditionally(const Instruction &Inst, + const DominatorTree *DT, + const TargetLibraryInfo *TLI, + const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo, + const Instruction *CtxI = nullptr); +static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const AAMDNodes &AAInfo, + AliasSetTracker *CurAST); +static Instruction *CloneInstructionInExitBlock(const Instruction &I, + BasicBlock &ExitBlock, + PHINode &PN, + const LoopInfo *LI); +static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, + DominatorTree *DT, TargetLibraryInfo *TLI, + Loop *CurLoop, AliasSetTracker *CurAST, + LICMSafetyInfo *SafetyInfo); + namespace { struct LICM : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -86,7 +113,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); @@ -94,7 +121,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } using llvm::Pass::doFinalization; @@ -109,7 +136,6 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - const DataLayout *DL; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. @@ -117,10 +143,6 @@ namespace { BasicBlock *Preheader; // The preheader block of the current loop... Loop *CurLoop; // The current loop we are working on... AliasSetTracker *CurAST; // AliasSet information for the current loop... 
- bool MayThrow; // The current loop contains an instruction which - // may throw, thus preventing code motion of - // instructions with side effects. - bool HeaderMayThrow; // Same as previous, but specific to loop header DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. @@ -133,88 +155,17 @@ namespace { /// Simple Analysis hook. Delete loop L from alias set map. void deleteAnalysisLoop(Loop *L) override; - - /// SinkRegion - Walk the specified region of the CFG (defined by all blocks - /// dominated by the specified block, and that are in the current loop) in - /// reverse depth first order w.r.t the DominatorTree. This allows us to - /// visit uses before definitions, allowing us to sink a loop body in one - /// pass without iteration. - /// - void SinkRegion(DomTreeNode *N); - - /// HoistRegion - Walk the specified region of the CFG (defined by all - /// blocks dominated by the specified block, and that are in the current - /// loop) in depth first order w.r.t the DominatorTree. This allows us to - /// visit definitions before uses, allowing us to hoist a loop body in one - /// pass without iteration. - /// - void HoistRegion(DomTreeNode *N); - - /// inSubLoop - Little predicate that returns true if the specified basic - /// block is in a subloop of the current one, not the current one itself. - /// - bool inSubLoop(BasicBlock *BB) { - assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); - return LI->getLoopFor(BB) != CurLoop; - } - - /// sink - When an instruction is found to only be used outside of the loop, - /// this function moves it to the exit blocks and patches up SSA form as - /// needed. - /// - void sink(Instruction &I); - - /// hoist - When an instruction is found to only use loop invariant operands - /// that is safe to hoist, this instruction is called to do the dirty work. - /// - void hoist(Instruction &I); - - /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it - /// is not a trapping instruction or if it is a trapping instruction and is - /// guaranteed to execute. - /// - bool isSafeToExecuteUnconditionally(Instruction &I); - - /// isGuaranteedToExecute - Check that the instruction is guaranteed to - /// execute. - /// - bool isGuaranteedToExecute(Instruction &I); - - /// pointerInvalidatedByLoop - Return true if the body of this loop may - /// store into the memory location pointed to by V. - /// - bool pointerInvalidatedByLoop(Value *V, uint64_t Size, - const AAMDNodes &AAInfo) { - // Check to see if any of the basic blocks in CurLoop invalidate *V. - return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); - } - - bool canSinkOrHoistInst(Instruction &I); - bool isNotUsedInLoop(Instruction &I); - - void PromoteAliasSet(AliasSet &AS, - SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts, - PredIteratorCache &PIC); - - /// \brief Create a copy of the instruction in the exit block and patch up - /// SSA. - /// PN is a user of I in ExitBlock that can be used to get the number and - /// list of predecessors fast. 
- Instruction *CloneInstructionInExitBlock(Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN); }; } char LICM::ID = 0; INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) @@ -231,13 +182,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { Changed = false; // Get our Loop and Alias Analysis information... - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -274,19 +223,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST->add(*BB); // Incorporate the specified basic block } - HeaderMayThrow = false; - BasicBlock *Header = L->getHeader(); - for (BasicBlock::iterator I = Header->begin(), E = Header->end(); - (I != E) && !HeaderMayThrow; ++I) - HeaderMayThrow |= I->mayThrow(); - MayThrow = HeaderMayThrow; - // TODO: We've already searched for instructions which may throw in subloops. - // We may want to reuse this information. - for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end(); - (BB != BBE) && !MayThrow ; ++BB) - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); - (I != E) && !MayThrow; ++I) - MayThrow |= I->mayThrow(); + // Compute loop safety information. + LICMSafetyInfo SafetyInfo; + computeLICMSafetyInfo(&SafetyInfo, CurLoop); // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of @@ -299,9 +238,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - SinkRegion(DT->getNode(L->getHeader())); + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop, + CurAST, &SafetyInfo); if (Preheader) - HoistRegion(DT->getNode(L->getHeader())); + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, + CurLoop, CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -313,7 +254,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop over all of the alias sets in the tracker object. 
for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); I != E; ++I) - PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC); + Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts, + PIC, LI, DT, CurLoop, + CurAST, &SafetyInfo); // Once we have promoted values across the loop body we have to recursively // reform LCSSA as any nested loop may now have values defined within the @@ -346,27 +289,35 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } -/// SinkRegion - Walk the specified region of the CFG (defined by all blocks -/// dominated by the specified block, and that are in the current loop) in -/// reverse depth first order w.r.t the DominatorTree. This allows us to visit -/// uses before definitions, allowing us to sink a loop body in one pass without -/// iteration. +/// Walk the specified region of the CFG (defined by all blocks dominated by +/// the specified block, and that are in the current loop) in reverse depth +/// first order w.r.t the DominatorTree. This allows us to visit uses before +/// definitions, allowing us to sink a loop body in one pass without iteration. /// -void LICM::SinkRegion(DomTreeNode *N) { - assert(N != nullptr && "Null dominator tree node?"); +bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, + DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { + + // Verify inputs. + assert(N != nullptr && AA != nullptr && LI != nullptr && + DT != nullptr && CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && "Unexpected input to sinkRegion"); + + // Set changed as false. + bool Changed = false; + // Get basic block BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return; + if (!CurLoop->contains(BB)) return Changed; // We are processing blocks in reverse dfo, so process children first. const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) - SinkRegion(Children[i]); - + Changed |= + sinkRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (inSubLoop(BB)) return; + if (inSubLoop(BB,CurLoop,LI)) return Changed; for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) { Instruction &I = *--II; @@ -387,35 +338,43 @@ void LICM::SinkRegion(DomTreeNode *N) { // outside of the loop. In this case, it doesn't even matter if the // operands of the instruction are loop invariant. // - if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) { + if (isNotUsedInLoop(I, CurLoop) && + canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) { ++II; - sink(I); + Changed |= sink(I, LI, DT, CurLoop, CurAST); } } + return Changed; } -/// HoistRegion - Walk the specified region of the CFG (defined by all blocks -/// dominated by the specified block, and that are in the current loop) in depth -/// first order w.r.t the DominatorTree. This allows us to visit definitions -/// before uses, allowing us to hoist a loop body in one pass without iteration. +/// Walk the specified region of the CFG (defined by all blocks dominated by +/// the specified block, and that are in the current loop) in depth first +/// order w.r.t the DominatorTree. This allows us to visit definitions before +/// uses, allowing us to hoist a loop body in one pass without iteration. 
 ///
-void LICM::HoistRegion(DomTreeNode *N) {
-  assert(N != nullptr && "Null dominator tree node?");
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+                       DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                       AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr &&
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
+         SafetyInfo != nullptr && "Unexpected input to hoistRegion");
+  // Set changed as false.
+  bool Changed = false;
+  // Get basic block
   BasicBlock *BB = N->getBlock();
-
   // If this subregion is not in the top level loop at all, exit.
-  if (!CurLoop->contains(BB)) return;
-
+  if (!CurLoop->contains(BB)) return Changed;
   // Only need to process the contents of this block if it is not part of a
   // subloop (which would already have been processed).
-  if (!inSubLoop(BB))
+  if (!inSubLoop(BB, CurLoop, LI))
     for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
       Instruction &I = *II++;
-
       // Try constant folding this instruction.  If all the operands are
       // constants, it is technically hoistable, but it would be better to just
       // fold it.
-      if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) {
+      if (Constant *C = ConstantFoldInstruction(
+              &I, I.getModule()->getDataLayout(), TLI)) {
         DEBUG(dbgs() << "LICM folding inst: " << I << "  --> " << *C << '\n');
         CurAST->copyValue(&I, C);
         CurAST->deleteValue(&I);
@@ -428,20 +387,49 @@ void LICM::HoistRegion(DomTreeNode *N) {
       // if all of the operands of the instruction are loop invariant and if it
       // is safe to hoist the instruction.
       //
-      if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) &&
-          isSafeToExecuteUnconditionally(I))
-        hoist(I);
+      if (CurLoop->hasLoopInvariantOperands(&I) &&
+          canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) &&
+          isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
+                                 CurLoop->getLoopPreheader()->getTerminator()))
+        Changed |= hoist(I, CurLoop->getLoopPreheader());
     }

   const std::vector<DomTreeNode*> &Children = N->getChildren();
   for (unsigned i = 0, e = Children.size(); i != e; ++i)
-    HoistRegion(Children[i]);
+    Changed |=
+        hoistRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+  return Changed;
+}
+
+/// Computes loop safety information, checks loop body & header
+/// for the possibility of a may-throw exception.
+///
+void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop can't be null");
+  BasicBlock *Header = CurLoop->getHeader();
+  // Setting default safety values.
+  SafetyInfo->MayThrow = false;
+  SafetyInfo->HeaderMayThrow = false;
+  // Iterate over header and compute safety info.
+  for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+       (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+    SafetyInfo->HeaderMayThrow |= I->mayThrow();
+
+  SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+  // Iterate over loop instructions and compute safety info.
+  for (Loop::block_iterator BB = CurLoop->block_begin(),
+       BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+         (I != E) && !SafetyInfo->MayThrow; ++I)
+      SafetyInfo->MayThrow |= I->mayThrow();
+}

 /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
 /// instruction.
 ///
-bool LICM::canSinkOrHoistInst(Instruction &I) {
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
+                        TargetLibraryInfo *TLI, Loop *CurLoop,
+                        AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
   // Loads have extra constraints we have to verify before we can hoist them.
   if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
     if (!LI->isUnordered())
       return false;
@@ -462,7 +450,7 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
     AAMDNodes AAInfo;
     LI->getAAMetadata(AAInfo);

-    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo);
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
     // Don't sink or hoist dbg info; it's legal, but not useful.
     if (isa<DbgInfoIntrinsic>(I))
       return false;
@@ -501,30 +489,34 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
       !isa<InsertValueInst>(I))
     return false;

-  return isSafeToExecuteUnconditionally(I);
+  // TODO: Plumb the context instruction through to make hoisting and sinking
+  // more powerful. Hoisting of loads already works due to the special casing
+  // above.
+  return isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
+                                        nullptr);
 }

-/// \brief Returns true if a PHINode is a trivially replaceable with an
+/// Returns true if a PHINode is trivially replaceable with an
 /// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
 ///
-/// This is true when all incoming values are that instruction. This pattern
-/// occurs most often with LCSSA PHI nodes.
-static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
-  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
-    if (PN.getIncomingValue(i) != &I)
+static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+  for (const Value *IncValue : PN.incoming_values())
+    if (IncValue != &I)
       return false;
   return true;
 }

-/// isNotUsedInLoop - Return true if the only users of this instruction are
-/// outside of the loop. If this is true, we can sink the instruction to the
-/// exit blocks of the loop.
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
 ///
-bool LICM::isNotUsedInLoop(Instruction &I) {
-  for (User *U : I.users()) {
-    Instruction *UI = cast<Instruction>(U);
-    if (PHINode *PN = dyn_cast<PHINode>(UI)) {
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) {
+  for (const User *U : I.users()) {
+    const Instruction *UI = cast<Instruction>(U);
+    if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
       // A PHI node where all of the incoming values are this instruction is
       // special -- it can just be RAUW'ed with the instruction and thus
      // doesn't require a use in the predecessor.
This is a particularly important
@@ -552,9 +544,10 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
   return true;
 }

-Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
-                                               BasicBlock &ExitBlock,
-                                               PHINode &PN) {
+static Instruction *CloneInstructionInExitBlock(const Instruction &I,
+                                                BasicBlock &ExitBlock,
+                                                PHINode &PN,
+                                                const LoopInfo *LI) {
   Instruction *New = I.clone();
   ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
   if (!I.getName().empty()) New->setName(I.getName() + ".le");
@@ -581,14 +574,15 @@ Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
   return New;
 }

-/// sink - When an instruction is found to only be used outside of the loop,
-/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
 ///
-void LICM::sink(Instruction &I) {
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
+                 const Loop *CurLoop, AliasSetTracker *CurAST ) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
-
+  bool Changed = false;
   if (isa<LoadInst>(I)) ++NumMovedLoads;
   else if (isa<CallInst>(I)) ++NumMovedCalls;
   ++NumSunk;
@@ -597,7 +591,8 @@ void LICM::sink(Instruction &I) {
 #ifndef NDEBUG
   SmallVector<BasicBlock *, 32> ExitBlocks;
   CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
 #endif

   // Clones of this instruction. Don't create more than one per exit block!
@@ -625,7 +620,7 @@ void LICM::sink(Instruction &I) {
         New = It->second;
       else
         New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN);
+              CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI);

       PN->replaceAllUsesWith(New);
       PN->eraseFromParent();
@@ -633,37 +628,43 @@ void LICM::sink(Instruction &I) {

   CurAST->deleteValue(&I);
   I.eraseFromParent();
+  return Changed;
 }

-/// hoist - When an instruction is found to only use loop invariant operands
-/// that is safe to hoist, this instruction is called to do the dirty work.
+/// When an instruction is found to use only loop invariant operands and it
+/// is safe to hoist, this function is called to do the dirty work.
 ///
-void LICM::hoist(Instruction &I) {
+static bool hoist(Instruction &I, BasicBlock *Preheader) {
   DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
         << I << "\n");
-
   // Move the new node to the Preheader, before its terminator.
   I.moveBefore(Preheader->getTerminator());

   if (isa<LoadInst>(I)) ++NumMovedLoads;
   else if (isa<CallInst>(I)) ++NumMovedCalls;
   ++NumHoisted;
-  Changed = true;
+  return true;
 }

-/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
-/// not a trapping instruction or if it is a trapping instruction and is
-/// guaranteed to execute.
-///
-bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
-  // If it is not a trapping instruction, it is always safe to hoist.
-  if (isSafeToSpeculativelyExecute(&Inst, DL))
+/// Only sink or hoist an instruction if it is not a trapping instruction,
+/// or if the instruction is known not to trap when moved to the preheader,
+/// or if it is a trapping instruction that is guaranteed to execute.
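For intuition, a hypothetical source fragment (an illustration, not taken from this change): with the guard below, the division is neither safe to speculate nor guaranteed to execute on every iteration, so LICM has to leave it in place; dropping the guard makes it execute unconditionally, and the guaranteed-to-execute test then permits hoisting even though the operation can still trap.

    void scale(int *out, const int *c, int d, int n) {
      for (int i = 0; i < n; ++i)
        if (c[i])               // not guaranteed to execute ...
          out[i] = 100 / d;     // ... and 'd' is loop-invariant but may be 0
    }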
+static bool isSafeToExecuteUnconditionally(const Instruction &Inst, + const DominatorTree *DT, + const TargetLibraryInfo *TLI, + const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo, + const Instruction *CtxI) { + if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) return true; - return isGuaranteedToExecute(Inst); + return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); } -bool LICM::isGuaranteedToExecute(Instruction &Inst) { +static bool isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop, + const LICMSafetyInfo * SafetyInfo) { // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop @@ -675,11 +676,11 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { if (Inst.getParent() == CurLoop->getHeader()) // If there's a throw in the header block, we can't guarantee we'll reach // Inst. - return !HeaderMayThrow; + return !SafetyInfo->HeaderMayThrow; // Somewhere in this loop there is an instruction which may throw and make us // exit the loop. - if (MayThrow) + if (SafetyInfo->MayThrow) return false; // Get the exit blocks for the current loop. @@ -719,17 +720,18 @@ namespace { // We need to create an LCSSA PHI node for the incoming value and // store that. PHINode *PN = PHINode::Create( - I->getType(), PredCache.GetNumPreds(BB), + I->getType(), PredCache.size(BB), I->getName() + ".lcssa", BB->begin()); - for (BasicBlock **PI = PredCache.GetPreds(BB); *PI; ++PI) - PN->addIncoming(I, *PI); + for (BasicBlock *Pred : PredCache.get(BB)) + PN->addIncoming(I, Pred); return PN; } return V; } public: - LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts, + LoopPromoter(Value *SP, + ArrayRef<const Instruction *> Insts, SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA, SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, @@ -777,25 +779,37 @@ namespace { }; } // end anon namespace -/// PromoteAliasSet - Try to promote memory values to scalars by sinking -/// stores out of the loop and moving loads to before the loop. We do this by -/// looping over the stores in the loop, looking for stores to Must pointers -/// which are loop invariant. +/// Try to promote memory values to scalars by sinking stores out of the +/// loop and moving loads to before the loop. We do this by looping over +/// the stores in the loop, looking for stores to Must pointers which are +/// loop invariant. /// -void LICM::PromoteAliasSet(AliasSet &AS, - SmallVectorImpl<BasicBlock*> &ExitBlocks, - SmallVectorImpl<Instruction*> &InsertPts, - PredIteratorCache &PIC) { +bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, + SmallVectorImpl<BasicBlock*>&ExitBlocks, + SmallVectorImpl<Instruction*>&InsertPts, + PredIteratorCache &PIC, LoopInfo *LI, + DominatorTree *DT, Loop *CurLoop, + AliasSetTracker *CurAST, + LICMSafetyInfo * SafetyInfo) { + // Verify inputs. + assert(LI != nullptr && DT != nullptr && + CurLoop != nullptr && CurAST != nullptr && + SafetyInfo != nullptr && + "Unexpected Input to promoteLoopAccessesToScalars"); + // Initially set Changed status to false. + bool Changed = false; // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. 
   if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
       AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
-    return;
+    return Changed;

   assert(!AS.empty() &&
          "Must alias set should have at least one pointer element in it!");
+
   Value *SomePtr = AS.begin()->getValue();
+  BasicBlock * Preheader = CurLoop->getLoopPreheader();

   // It isn't safe to promote a load/store from the loop if the load/store is
   // conditional.  For example, turning:
@@ -832,7 +846,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
     // cannot (yet) promote a memory location that is loaded and stored in
     // different sizes.
     if (SomePtr->getType() != ASIV->getType())
-      return;
+      return Changed;

     for (User *U : ASIV->users()) {
       // Ignore instructions that are outside the loop.
@@ -842,25 +856,25 @@ void LICM::PromoteAliasSet(AliasSet &AS,
       // If there is a non-load/store instruction in the loop, we can't promote
       // it.
-      if (LoadInst *load = dyn_cast<LoadInst>(UI)) {
+      if (const LoadInst *load = dyn_cast<LoadInst>(UI)) {
         assert(!load->isVolatile() && "AST broken");
         if (!load->isSimple())
-          return;
-      } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) {
+          return Changed;
+      } else if (const StoreInst *store = dyn_cast<StoreInst>(UI)) {
         // Stores *of* the pointer are not interesting, only stores *to* the
         // pointer.
         if (UI->getOperand(1) != ASIV)
           continue;
         assert(!store->isVolatile() && "AST broken");
         if (!store->isSimple())
-          return;
+          return Changed;
         // Don't sink stores from loops without dedicated block exits. Exits
         // containing indirect branches are not transformed by loop simplify,
         // make sure we catch that. An additional load may be generated in the
         // preheader for SSA updater, so also avoid sinking when no preheader
         // is available.
         if (!HasDedicatedExits || !Preheader)
-          return;
+          return Changed;

         // Note that we only check GuaranteedToExecute inside the store case
         // so that we do not introduce stores where they did not exist before
@@ -872,16 +886,17 @@ void LICM::PromoteAliasSet(AliasSet &AS,
         // Larger is better, with the exception of 0 being the best alignment.
         unsigned InstAlignment = store->getAlignment();
         if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
-          if (isGuaranteedToExecute(*UI)) {
+          if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
             GuaranteedToExecute = true;
             Alignment = InstAlignment;
           }

         if (!GuaranteedToExecute)
-          GuaranteedToExecute = isGuaranteedToExecute(*UI);
+          GuaranteedToExecute = isGuaranteedToExecute(*UI, DT,
+                                                      CurLoop, SafetyInfo);

       } else
-        return; // Not a load or store.
+        return Changed; // Not a load or store.

       // Merge the AA tags.
       if (LoopUses.empty()) {
@@ -897,7 +912,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,

   // If there isn't a guaranteed-to-execute instruction, we can't promote.
   if (!GuaranteedToExecute)
-    return;
+    return Changed;

   // Otherwise, this is safe to promote, let's do it!
   DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
@@ -922,7 +937,8 @@ void LICM::PromoteAliasSet(AliasSet &AS,
   // We use the SSAUpdater interface to insert phi nodes as required.
   SmallVector<PHINode*, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
-  LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+  LoopPromoter Promoter(SomePtr, LoopUses, SSA,
+                        PointerMustAliases, ExitBlocks,
                         InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);

   // Set up the preheader to have a definition of the value.
It is the live-out @@ -942,10 +958,12 @@ void LICM::PromoteAliasSet(AliasSet &AS, // If the SSAUpdater didn't use the load in the preheader, just zap it now. if (PreheaderLoad->use_empty()) PreheaderLoad->eraseFromParent(); -} + return Changed; +} -/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. +/// Simple Analysis hook. Clone alias set info. +/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -954,8 +972,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AST->copyValue(From, To); } -/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias -/// set. +/// Simple Analysis hook. Delete value V from alias set +/// void LICM::deleteAnalysisValue(Value *V, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -965,6 +983,7 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) { } /// Simple Analysis hook. Delete value L from alias set map. +/// void LICM::deleteAnalysisLoop(Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); if (!AST) @@ -973,3 +992,23 @@ void LICM::deleteAnalysisLoop(Loop *L) { delete AST; LoopToAliasSetMap.erase(L); } + + +/// Return true if the body of this loop may store into the memory +/// location pointed to by V. +/// +static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const AAMDNodes &AAInfo, + AliasSetTracker *CurAST) { + // Check to see if any of the basic blocks in CurLoop invalidate *V. + return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); +} + +/// Little predicate that returns true if the specified basic block is in +/// a subloop of the current one, not the current one itself. +/// +static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) { + assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); + return LI->getLoopFor(BB) != CurLoop; +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp index 11e4d76..c19cd19 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -12,17 +12,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" - #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/TargetFolder.h" -#include "llvm/Pass.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -41,9 +41,9 @@ struct PointerOffsetPair { }; struct LoadPOPPair { + LoadPOPPair() = default; LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O) : Load(L), POP(P), InsertOrder(O) {} - LoadPOPPair() {} LoadInst *Load; PointerOffsetPair POP; /// \brief The new load needs to be created before the first load in IR order. 
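For context, a sketch of the transformation this pass performs (an illustration under assumed conditions such as little-endian layout, sufficient alignment, and adjacent constant offsets; it is not code from this change): two narrow loads from a common base become one wide load plus integer extracts.

    #include <cstdint>
    #include <cstring>

    uint64_t sum_pair(const uint32_t *p) {
      // Before combining: two 4-byte loads, p[0] and p[1].
      // After combining (conceptually): one 8-byte load ...
      uint64_t Wide;
      std::memcpy(&Wide, p, sizeof(Wide));
      // ... and the original values recovered by extraction (little-endian).
      uint32_t Lo = static_cast<uint32_t>(Wide);
      uint32_t Hi = static_cast<uint32_t>(Wide >> 32);
      return static_cast<uint64_t>(Lo) + Hi;
    }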
@@ -52,13 +52,10 @@ struct LoadPOPPair { class LoadCombine : public BasicBlockPass { LLVMContext *C; - const DataLayout *DL; AliasAnalysis *AA; public: - LoadCombine() - : BasicBlockPass(ID), - C(nullptr), DL(nullptr), AA(nullptr) { + LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } @@ -85,12 +82,6 @@ private: bool LoadCombine::doInitialization(Function &F) { DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); C = &F.getContext(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) { - DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n"); - return false; - } - DL = &DLP->getDataLayout(); return true; } @@ -100,9 +91,10 @@ PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { POP.Offset = 0; while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { - unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType()); + auto &DL = LI.getModule()->getDataLayout(); + unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType()); APInt Offset(BitWidth, 0); - if (GEP->accumulateConstantOffset(*DL, Offset)) + if (GEP->accumulateConstantOffset(DL, Offset)) POP.Offset += Offset.getZExtValue(); else // Can't handle GEPs with variable indices. @@ -145,7 +137,8 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { if (PrevOffset == -1ull) { BaseLoad = L.Load; PrevOffset = L.POP.Offset; - PrevSize = DL->getTypeStoreSize(L.Load->getType()); + PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( + L.Load->getType()); AggregateLoads.push_back(L); continue; } @@ -164,7 +157,8 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { // FIXME: We may want to handle this case. 
continue; PrevOffset = L.POP.Offset; - PrevSize = DL->getTypeStoreSize(L.Load->getType()); + PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( + L.Load->getType()); AggregateLoads.push_back(L); } if (combineLoads(AggregateLoads)) @@ -215,7 +209,8 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { for (const auto &L : Loads) { Builder->SetInsertPoint(L.Load); Value *V = Builder->CreateExtractInteger( - *DL, NewLoad, cast<IntegerType>(L.Load->getType()), + L.Load->getModule()->getDataLayout(), NewLoad, + cast<IntegerType>(L.Load->getType()), L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); L.Load->replaceAllUsesWith(V); } @@ -225,13 +220,13 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { } bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { - if (skipOptnoneFunction(BB) || !DL) + if (skipOptnoneFunction(BB)) return false; AA = &getAnalysis<AliasAnalysis>(); - IRBuilder<true, TargetFolder> - TheBuilder(BB.getContext(), TargetFolder(DL)); + IRBuilder<true, TargetFolder> TheBuilder( + BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); Builder = &TheBuilder; DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 1d1f33a..98b068e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -39,14 +39,14 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -63,7 +63,7 @@ char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -236,7 +236,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Finally, the blocks from loopinfo. This has to happen late because // otherwise our loop iterators won't work. - LoopInfo &loopInfo = getAnalysis<LoopInfo>(); + LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SmallPtrSet<BasicBlock*, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp new file mode 100644 index 0000000..a907d59 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -0,0 +1,976 @@ +//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Loop Distribution Pass. 
Its main focus is to +// distribute loops that cannot be vectorized due to dependence cycles. It +// tries to isolate the offending dependences into a new loop allowing +// vectorization of the remaining parts. +// +// For dependence analysis, the pass uses the LoopVectorizer's +// LoopAccessAnalysis. Because this analysis presumes no change in the order of +// memory operations, special care is taken to preserve the lexical order of +// these operations. +// +// Similarly to the Vectorizer, the pass also supports loop versioning to +// run-time disambiguate potentially overlapping arrays. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include <list> + +#define LDIST_NAME "loop-distribute" +#define DEBUG_TYPE LDIST_NAME + +using namespace llvm; + +static cl::opt<bool> + LDistVerify("loop-distribute-verify", cl::Hidden, + cl::desc("Turn on DominatorTree and LoopInfo verification " + "after Loop Distribution"), + cl::init(false)); + +static cl::opt<bool> DistributeNonIfConvertible( + "loop-distribute-non-if-convertible", cl::Hidden, + cl::desc("Whether to distribute into a loop that may not be " + "if-convertible by the loop vectorizer"), + cl::init(false)); + +STATISTIC(NumLoopsDistributed, "Number of loops distributed"); + +/// \brief Remaps instructions in a loop including the preheader. +static void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks, + ValueToValueMapTy &VMap) { + // Rewrite the code to refer to itself. + for (auto *BB : Blocks) + for (auto &Inst : *BB) + RemapInstruction(&Inst, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); +} + +/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p +/// Blocks. +/// +/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block +/// \p LoopDomBB. Insert the new blocks before block specified in \p Before. +static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, + Loop *OrigLoop, ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl<BasicBlock *> &Blocks) { + Function *F = OrigLoop->getHeader()->getParent(); + Loop *ParentLoop = OrigLoop->getParentLoop(); + + Loop *NewLoop = new Loop(); + if (ParentLoop) + ParentLoop->addChildLoop(NewLoop); + else + LI->addTopLevelLoop(NewLoop); + + BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); + BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); + // To rename the loop PHIs. + VMap[OrigPH] = NewPH; + Blocks.push_back(NewPH); + + // Update LoopInfo. + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewPH, *LI); + + // Update DominatorTree. + DT->addNewBlock(NewPH, LoopDomBB); + + for (BasicBlock *BB : OrigLoop->getBlocks()) { + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + VMap[BB] = NewBB; + + // Update LoopInfo. + NewLoop->addBasicBlockToLoop(NewBB, *LI); + + // Update DominatorTree. 
+    BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+    DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+
+    Blocks.push_back(NewBB);
+  }
+
+  // Move them physically from the end of the block list.
+  F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
+  F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
+                                NewLoop->getHeader(), F->end());
+
+  return NewLoop;
+}
+
+namespace {
+/// \brief Maintains the set of instructions of the loop for a partition before
+/// cloning.  After cloning, it hosts the new loop.
+class InstPartition {
+  typedef SmallPtrSet<Instruction *, 8> InstructionSet;
+
+public:
+  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
+    Set.insert(I);
+  }
+
+  /// \brief Returns whether this partition contains a dependence cycle.
+  bool hasDepCycle() const { return DepCycle; }
+
+  /// \brief Adds an instruction to this partition.
+  void add(Instruction *I) { Set.insert(I); }
+
+  /// \brief Collection accessors.
+  InstructionSet::iterator begin() { return Set.begin(); }
+  InstructionSet::iterator end() { return Set.end(); }
+  InstructionSet::const_iterator begin() const { return Set.begin(); }
+  InstructionSet::const_iterator end() const { return Set.end(); }
+  bool empty() const { return Set.empty(); }
+
+  /// \brief Moves this partition into \p Other.  This partition becomes empty
+  /// after this.
+  void moveTo(InstPartition &Other) {
+    Other.Set.insert(Set.begin(), Set.end());
+    Set.clear();
+    Other.DepCycle |= DepCycle;
+  }
+
+  /// \brief Populates the partition with a transitive closure of all the
+  /// instructions that the seeded instructions depend on.
+  void populateUsedSet() {
+    // FIXME: We currently don't use control-dependence but simply include all
+    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+    // up.
+    for (auto *B : OrigLoop->getBlocks())
+      Set.insert(B->getTerminator());
+
+    // Follow the use-def chains to form a transitive closure of all the
+    // instructions that the originally seeded instructions depend on.
+    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      // Insert instructions from the loop that we depend on.
+      for (Value *V : I->operand_values()) {
+        auto *I = dyn_cast<Instruction>(V);
+        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+          Worklist.push_back(I);
+      }
+    }
+  }
+
+  /// \brief Clones the original loop.
+  ///
+  /// Updates LoopInfo and DominatorTree using the information that block \p
+  /// LoopDomBB dominates the loop.
+  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+                               unsigned Index, LoopInfo *LI,
+                               DominatorTree *DT) {
+    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+                                          VMap, Twine(".ldist") + Twine(Index),
+                                          LI, DT, ClonedLoopBlocks);
+    return ClonedLoop;
+  }
+
+  /// \brief The cloned loop.  If this partition is mapped to the original loop,
+  /// this is null.
+  const Loop *getClonedLoop() const { return ClonedLoop; }
+
+  /// \brief Returns the loop where this partition ends up after distribution.
+  /// If this partition is mapped to the original loop then use the block from
+  /// the loop.
+  const Loop *getDistributedLoop() const {
+    return ClonedLoop ? ClonedLoop : OrigLoop;
+  }
+
+  /// \brief The VMap that is populated by cloning and then used in
+  /// RemapInstruction to remap the cloned instructions.
+  ValueToValueMapTy &getVMap() { return VMap; }
+
+  /// \brief Remaps the cloned instructions using VMap.
+  void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); }
+
+  /// \brief Based on the set of instructions selected for this partition,
+  /// removes the unnecessary ones.
+  void removeUnusedInsts() {
+    SmallVector<Instruction *, 8> Unused;
+
+    for (auto *Block : OrigLoop->getBlocks())
+      for (auto &Inst : *Block)
+        if (!Set.count(&Inst)) {
+          Instruction *NewInst = &Inst;
+          if (!VMap.empty())
+            NewInst = cast<Instruction>(VMap[NewInst]);
+
+          assert(!isa<BranchInst>(NewInst) &&
+                 "Branches are marked used early on");
+          Unused.push_back(NewInst);
+        }
+
+    // Delete the instructions backwards, as that reduces the likelihood of
+    // having to update as many def-use and use-def chains.
+    for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) {
+      auto *Inst = *I;
+
+      if (!Inst->use_empty())
+        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+      Inst->eraseFromParent();
+    }
+  }
+
+  void print() const {
+    if (DepCycle)
+      dbgs() << "  (cycle)\n";
+    for (auto *I : Set)
+      // Prefix with the block name.
+      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
+  }
+
+  void printBlocks() const {
+    for (auto *BB : getDistributedLoop()->getBlocks())
+      dbgs() << *BB;
+  }
+
+private:
+  /// \brief Instructions from OrigLoop selected for this partition.
+  InstructionSet Set;
+
+  /// \brief Whether this partition contains a dependence cycle.
+  bool DepCycle;
+
+  /// \brief The original loop.
+  Loop *OrigLoop;
+
+  /// \brief The cloned loop.  If this partition is mapped to the original loop,
+  /// this is null.
+  Loop *ClonedLoop;
+
+  /// \brief The blocks of ClonedLoop including the preheader.  If this
+  /// partition is mapped to the original loop, this is empty.
+  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+  /// \brief These get populated once the set of instructions has been
+  /// finalized.  If this partition is mapped to the original loop, these are
+  /// not set.
+  ValueToValueMapTy VMap;
+};
+
+/// \brief Holds the set of Partitions.  It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+  typedef DenseMap<Instruction *, int> InstToPartitionIdT;
+
+public:
+  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+      : L(L), LI(LI), DT(DT) {}
+
+  /// \brief Returns the number of partitions.
+  unsigned getSize() const { return PartitionContainer.size(); }
+
+  /// \brief Adds \p Inst into the current partition if that is marked to
+  /// contain cycles.  Otherwise start a new partition for it.
+  void addToCyclicPartition(Instruction *Inst) {
+    // If the current partition is non-cyclic, start a new one.
+    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+    else
+      PartitionContainer.back().add(Inst);
+  }
+
+  /// \brief Adds \p Inst into a partition that is not marked to contain
+  /// dependence cycles.
+  ///
+  // Initially we isolate memory instructions into as many partitions as
+  // possible, then later we may merge them back together.
+  void addToNewNonCyclicPartition(Instruction *Inst) {
+    PartitionContainer.emplace_back(Inst, L);
+  }
+
+  /// \brief Merges adjacent non-cyclic partitions.
+  ///
+  /// The idea is that we currently only want to isolate the non-vectorizable
+  /// partition.  We could later allow more distribution among these partitions
+  /// too.
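The next two merge helpers implement this policy. For the second one, merging partitions whose only stores are conditional, a hypothetical source shape (not taken from the patch) illustrates the motivation: a loop body consisting of a guarded store cannot be if-converted by the vectorizer, so it is not worth isolating into its own loop and is folded back into a neighbouring cyclic partition.

    void f(int *A, const int *c, int n) {
      for (int i = 0; i < n; ++i)
        if (c[i])     // conditional store: not if-convertible on its own,
          A[i] = 0;   // so keep it with the cyclic partition
    }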
+ void mergeAdjacentNonCyclic() { + mergeAdjacentPartitionsIf( + [](const InstPartition *P) { return !P->hasDepCycle(); }); + } + + /// \brief If a partition contains only conditional stores, we won't vectorize + /// it. Try to merge it with a previous cyclic partition. + void mergeNonIfConvertible() { + mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { + if (Partition->hasDepCycle()) + return true; + + // Now, check if all stores are conditional in this partition. + bool seenStore = false; + + for (auto *Inst : *Partition) + if (isa<StoreInst>(Inst)) { + seenStore = true; + if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) + return false; + } + return seenStore; + }); + } + + /// \brief Merges the partitions according to various heuristics. + void mergeBeforePopulating() { + mergeAdjacentNonCyclic(); + if (!DistributeNonIfConvertible) + mergeNonIfConvertible(); + } + + /// \brief Merges partitions in order to ensure that no loads are duplicated. + /// + /// We can't duplicate loads because that could potentially reorder them. + /// LoopAccessAnalysis provides dependency information with the context that + /// the order of memory operation is preserved. + /// + /// Return if any partitions were merged. + bool mergeToAvoidDuplicatedLoads() { + typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; + typedef EquivalenceClasses<InstPartition *> ToBeMergedT; + + LoadToPartitionT LoadToPartition; + ToBeMergedT ToBeMerged; + + // Step through the partitions and create equivalence between partitions + // that contain the same load. Also put partitions in between them in the + // same equivalence class to avoid reordering of memory operations. + for (PartitionContainerT::iterator I = PartitionContainer.begin(), + E = PartitionContainer.end(); + I != E; ++I) { + auto *PartI = &*I; + + // If a load occurs in two partitions PartI and PartJ, merge all + // partitions (PartI, PartJ] into PartI. + for (Instruction *Inst : *PartI) + if (isa<LoadInst>(Inst)) { + bool NewElt; + LoadToPartitionT::iterator LoadToPart; + + std::tie(LoadToPart, NewElt) = + LoadToPartition.insert(std::make_pair(Inst, PartI)); + if (!NewElt) { + DEBUG(dbgs() << "Merging partitions due to this load in multiple " + << "partitions: " << PartI << ", " + << LoadToPart->second << "\n" << *Inst << "\n"); + + auto PartJ = I; + do { + --PartJ; + ToBeMerged.unionSets(PartI, &*PartJ); + } while (&*PartJ != LoadToPart->second); + } + } + } + if (ToBeMerged.empty()) + return false; + + // Merge the member of an equivalence class into its class leader. This + // makes the members empty. + for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); + I != E; ++I) { + if (!I->isLeader()) + continue; + + auto PartI = I->getData(); + for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), + ToBeMerged.member_end())) { + PartJ->moveTo(*PartI); + } + } + + // Remove the empty partitions. + PartitionContainer.remove_if( + [](const InstPartition &P) { return P.empty(); }); + + return true; + } + + /// \brief Sets up the mapping between instructions to partitions. If the + /// instruction is duplicated across multiple partitions, set the entry to -1. 
+ void setupPartitionIdOnInstructions() { + int PartitionID = 0; + for (const auto &Partition : PartitionContainer) { + for (Instruction *Inst : Partition) { + bool NewElt; + InstToPartitionIdT::iterator Iter; + + std::tie(Iter, NewElt) = + InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); + if (!NewElt) + Iter->second = -1; + } + ++PartitionID; + } + } + + /// \brief Populates the partition with everything that the seeding + /// instructions require. + void populateUsedSet() { + for (auto &P : PartitionContainer) + P.populateUsedSet(); + } + + /// \brief This performs the main chunk of the work of cloning the loops for + /// the partitions. + void cloneLoops(Pass *P) { + BasicBlock *OrigPH = L->getLoopPreheader(); + // At this point the predecessor of the preheader is either the memcheck + // block or the top part of the original preheader. + BasicBlock *Pred = OrigPH->getSinglePredecessor(); + assert(Pred && "Preheader does not have a single predecessor"); + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && "No single exit block"); + Loop *NewLoop; + + assert(!PartitionContainer.empty() && "at least two partitions expected"); + // We're cloning the preheader along with the loop so we already made sure + // it was empty. + assert(&*OrigPH->begin() == OrigPH->getTerminator() && + "preheader not empty"); + + // Create a loop for each partition except the last. Clone the original + // loop before PH along with adding a preheader for the cloned loop. Then + // update PH to point to the newly added preheader. + BasicBlock *TopPH = OrigPH; + unsigned Index = getSize() - 1; + for (auto I = std::next(PartitionContainer.rbegin()), + E = PartitionContainer.rend(); + I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { + auto *Part = &*I; + + NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); + + Part->getVMap()[ExitBlock] = TopPH; + Part->remapInstructions(); + } + Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); + + // Now go in forward order and update the immediate dominator for the + // preheaders with the exiting block of the previous loop. Dominance + // within the loop is updated in cloneLoopWithPreheader. + for (auto Curr = PartitionContainer.cbegin(), + Next = std::next(PartitionContainer.cbegin()), + E = PartitionContainer.cend(); + Next != E; ++Curr, ++Next) + DT->changeImmediateDominator( + Next->getDistributedLoop()->getLoopPreheader(), + Curr->getDistributedLoop()->getExitingBlock()); + } + + /// \brief Removes the dead instructions from the cloned loops. + void removeUnusedInsts() { + for (auto &Partition : PartitionContainer) + Partition.removeUnusedInsts(); + } + + /// \brief For each memory pointer, it computes the partitionId the pointer is + /// used in. + /// + /// This returns an array of int where the I-th entry corresponds to I-th + /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple + /// partitions its entry is set to -1. + SmallVector<int, 8> + computePartitionSetForPointers(const LoopAccessInfo &LAI) { + const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck = + LAI.getRuntimePointerCheck(); + + unsigned N = RtPtrCheck->Pointers.size(); + SmallVector<int, 8> PtrToPartitions(N); + for (unsigned I = 0; I < N; ++I) { + Value *Ptr = RtPtrCheck->Pointers[I]; + auto Instructions = + LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]); + + int &Partition = PtrToPartitions[I]; + // First set it to uninitialized. 
+ Partition = -2; + for (Instruction *Inst : Instructions) { + // Note that this could be -1 if Inst is duplicated across multiple + // partitions. + int ThisPartition = this->InstToPartitionId[Inst]; + if (Partition == -2) + Partition = ThisPartition; + // -1 means belonging to multiple partitions. + else if (Partition == -1) + break; + else if (Partition != (int)ThisPartition) + Partition = -1; + } + assert(Partition != -2 && "Pointer not belonging to any partition"); + } + + return PtrToPartitions; + } + + void print(raw_ostream &OS) const { + unsigned Index = 0; + for (const auto &P : PartitionContainer) { + OS << "Partition " << Index++ << " (" << &P << "):\n"; + P.print(); + } + } + + void dump() const { print(dbgs()); } + +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, + const InstPartitionContainer &Partitions) { + Partitions.print(OS); + return OS; + } +#endif + + void printBlocks() const { + unsigned Index = 0; + for (const auto &P : PartitionContainer) { + dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; + P.printBlocks(); + } + } + +private: + typedef std::list<InstPartition> PartitionContainerT; + + /// \brief List of partitions. + PartitionContainerT PartitionContainer; + + /// \brief Mapping from Instruction to partition Id. If the instruction + /// belongs to multiple partitions the entry contains -1. + InstToPartitionIdT InstToPartitionId; + + Loop *L; + LoopInfo *LI; + DominatorTree *DT; + + /// \brief The control structure to merge adjacent partitions if both satisfy + /// the \p Predicate. + template <class UnaryPredicate> + void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { + InstPartition *PrevMatch = nullptr; + for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { + auto DoesMatch = Predicate(&*I); + if (PrevMatch == nullptr && DoesMatch) { + PrevMatch = &*I; + ++I; + } else if (PrevMatch != nullptr && DoesMatch) { + I->moveTo(*PrevMatch); + I = PartitionContainer.erase(I); + } else { + PrevMatch = nullptr; + ++I; + } + } + } +}; + +/// \brief For each memory instruction, this class maintains difference of the +/// number of unsafe dependences that start out from this instruction minus +/// those that end here. +/// +/// By traversing the memory instructions in program order and accumulating this +/// number, we know whether any unsafe dependence crosses over a program point. +class MemoryInstructionDependences { + typedef MemoryDepChecker::Dependence Dependence; + +public: + struct Entry { + Instruction *Inst; + unsigned NumUnsafeDependencesStartOrEnd; + + Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} + }; + + typedef SmallVector<Entry, 8> AccessesType; + + AccessesType::const_iterator begin() const { return Accesses.begin(); } + AccessesType::const_iterator end() const { return Accesses.end(); } + + MemoryInstructionDependences( + const SmallVectorImpl<Instruction *> &Instructions, + const SmallVectorImpl<Dependence> &InterestingDependences) { + Accesses.append(Instructions.begin(), Instructions.end()); + + DEBUG(dbgs() << "Backward dependences:\n"); + for (auto &Dep : InterestingDependences) + if (Dep.isPossiblyBackward()) { + // Note that the designations source and destination follow the program + // order, i.e. source is always first. (The direction is given by the + // DepType.) 
+      ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+      --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+      DEBUG(Dep.print(dbgs(), 2, Instructions));
+    }
+  }
+
+private:
+  AccessesType Accesses;
+};
+
+/// \brief Handles the loop versioning based on memchecks.
+class RuntimeCheckEmitter {
+public:
+  RuntimeCheckEmitter(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
+                      DominatorTree *DT)
+      : OrigLoop(L), NonDistributedLoop(nullptr), LAI(LAI), LI(LI), DT(DT) {}
+
+  /// \brief Given the \p Partitions formed by Loop Distribution, it determines
+  /// in which partition each pointer is used.
+  void partitionPointers(InstPartitionContainer &Partitions) {
+    // Set up partition id in PtrRtChecks.  Ptr -> Access -> Instruction ->
+    // Partition.
+    PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
+
+    DEBUG(dbgs() << "\nPointers:\n");
+    DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition));
+  }
+
+  /// \brief Returns true if we need memchecks to distribute the loop.
+  bool needsRuntimeChecks() const {
+    return LAI.getRuntimePointerCheck()->needsAnyChecking(&PtrToPartition);
+  }
+
+  /// \brief Performs the CFG manipulation part of versioning the loop including
+  /// the DominatorTree and LoopInfo updates.
+  void versionLoop(Pass *P) {
+    Instruction *FirstCheckInst;
+    Instruction *MemRuntimeCheck;
+    // Add the memcheck in the original preheader (this is empty initially).
+    BasicBlock *MemCheckBB = OrigLoop->getLoopPreheader();
+    std::tie(FirstCheckInst, MemRuntimeCheck) =
+        LAI.addRuntimeCheck(MemCheckBB->getTerminator(), &PtrToPartition);
+    assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
+
+    // Rename the block to make the IR more readable.
+    MemCheckBB->setName(OrigLoop->getHeader()->getName() + ".ldist.memcheck");
+
+    // Create empty preheader for the loop (and after cloning for the
+    // original/nondist loop).
+    BasicBlock *PH =
+        SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI);
+    PH->setName(OrigLoop->getHeader()->getName() + ".ph");
+
+    // Clone the loop including the preheader.
+    //
+    // FIXME: This does not currently preserve SimplifyLoop because the exit
+    // block is a join between the two loops.
+    SmallVector<BasicBlock *, 8> NonDistributedLoopBlocks;
+    NonDistributedLoop =
+        cloneLoopWithPreheader(PH, MemCheckBB, OrigLoop, VMap, ".ldist.nondist",
+                               LI, DT, NonDistributedLoopBlocks);
+    remapInstructionsInLoop(NonDistributedLoopBlocks, VMap);
+
+    // Insert the conditional branch based on the result of the memchecks.
+    Instruction *OrigTerm = MemCheckBB->getTerminator();
+    BranchInst::Create(NonDistributedLoop->getLoopPreheader(),
+                       OrigLoop->getLoopPreheader(), MemRuntimeCheck, OrigTerm);
+    OrigTerm->eraseFromParent();
+
+    // The loops merge in the original exit block.  This is now dominated by the
+    // memchecking block.
+    DT->changeImmediateDominator(OrigLoop->getExitBlock(), MemCheckBB);
+  }
+
+  /// \brief Adds the necessary PHI nodes for the versioned loops based on the
+  /// loop-defined values used outside of the loop.
+  void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+    BasicBlock *PHIBlock = OrigLoop->getExitBlock();
+    assert(PHIBlock && "No single successor to loop exit block");
+
+    for (auto *Inst : DefsUsedOutside) {
+      auto *NonDistInst = cast<Instruction>(VMap[Inst]);
+      PHINode *PN;
+
+      // First see if we have a single-operand PHI with the value defined by the
+      // original loop.
+      for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+        assert(PN->getNumOperands() == 1 &&
+               "Exit block should only have one predecessor");
+        if (PN->getIncomingValue(0) == Inst)
+          break;
+      }
+      // If not, create it.
+      if (!PN) {
+        PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".ldist",
+                             PHIBlock->begin());
+        for (auto *User : Inst->users())
+          if (!OrigLoop->contains(cast<Instruction>(User)->getParent()))
+            User->replaceUsesOfWith(Inst, PN);
+        PN->addIncoming(Inst, OrigLoop->getExitingBlock());
+      }
+      // Add the new incoming value from the non-distributed loop.
+      PN->addIncoming(NonDistInst, NonDistributedLoop->getExitingBlock());
+    }
+  }
+
+private:
+  /// \brief The original loop.  This becomes the "versioned" one, i.e. control
+  /// goes here if the memchecks all pass.
+  Loop *OrigLoop;
+  /// \brief The fall-back loop, i.e. if any of the memchecks fail.
+  Loop *NonDistributedLoop;
+
+  /// \brief For each memory pointer it contains the partitionId it is used in.
+  ///
+  /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck().
+  /// If the pointer is used in multiple partitions the entry is set to -1.
+  SmallVector<int, 8> PtrToPartition;
+
+  /// \brief This maps the instructions from OrigLoop to their counterpart in
+  /// NonDistributedLoop.
+  ValueToValueMapTy VMap;
+
+  /// \brief Analyses used.
+  const LoopAccessInfo &LAI;
+  LoopInfo *LI;
+  DominatorTree *DT;
+};
+
+/// \brief Returns the instructions that use values defined in the loop.
+static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) {
+  SmallVector<Instruction *, 8> UsedOutside;
+
+  for (auto *Block : L->getBlocks())
+    // FIXME: I believe that this could use copy_if if the Inst reference could
+    // be adapted into a pointer.
+    for (auto &Inst : *Block) {
+      auto Users = Inst.users();
+      if (std::any_of(Users.begin(), Users.end(), [&](User *U) {
+            auto *Use = cast<Instruction>(U);
+            return !L->contains(Use->getParent());
+          }))
+        UsedOutside.push_back(&Inst);
+    }
+
+  return UsedOutside;
+}
+
+/// \brief The pass class.
+class LoopDistribute : public FunctionPass {
+public:
+  LoopDistribute() : FunctionPass(ID) {
+    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    LAA = &getAnalysis<LoopAccessAnalysis>();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    // Build up a worklist of inner-most loops to distribute.  This is necessary
+    // as the act of distributing a loop creates new loops and can invalidate
+    // iterators across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *TopLevelLoop : *LI)
+      for (Loop *L : depth_first(TopLevelLoop))
+        // We only handle inner-most loops.
+        if (L->empty())
+          Worklist.push_back(L);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    for (Loop *L : Worklist)
+      Changed |= processLoop(L);
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<LoopAccessAnalysis>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  static char ID;
+
+private:
+  /// \brief Try to distribute an inner-most loop.
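A worked example of the overall goal (a typical textbook shape, assumed rather than taken from this patch): the first statement forms a loop-carried dependence cycle, the second does not; distribution isolates the cycle so the remainder can be vectorized.

    // Assumes A has at least n + 1 elements.
    void dist(int *A, const int *B, const int *C, int *D, int n) {
      // Before: one loop; the A-recurrence blocks vectorizing any of it.
      for (int i = 0; i < n; ++i) {
        A[i + 1] = A[i] + B[i]; // dependence cycle
        D[i] = C[i] * 2;        // independent
      }
    }
    // After distribution (conceptually):
    //   for (i) A[i + 1] = A[i] + B[i];  // stays scalar
    //   for (i) D[i]     = C[i] * 2;     // now vectorizable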
+  bool processLoop(Loop *L) {
+    assert(L->empty() && "Only process inner loops.");
+
+    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+
+    BasicBlock *PH = L->getLoopPreheader();
+    if (!PH) {
+      DEBUG(dbgs() << "Skipping; no preheader");
+      return false;
+    }
+    if (!L->getExitBlock()) {
+      DEBUG(dbgs() << "Skipping; multiple exit blocks");
+      return false;
+    }
+    // LAA will check that we only have a single exiting block.
+
+    const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+
+    // Currently, we only distribute to isolate the part of the loop with
+    // dependence cycles to enable partial vectorization.
+    if (LAI.canVectorizeMemory()) {
+      DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
+      return false;
+    }
+    auto *InterestingDependences =
+        LAI.getDepChecker().getInterestingDependences();
+    if (!InterestingDependences || InterestingDependences->empty()) {
+      DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
+      return false;
+    }
+
+    InstPartitionContainer Partitions(L, LI, DT);
+
+    // First, go through each memory operation and assign it to consecutive
+    // partitions (the order of partitions follows program order).  Put those
+    // with unsafe dependences into the "cyclic" partition; otherwise put each
+    // store in its own "non-cyclic" partition (we'll merge these later).
+    //
+    // Note that a memory operation (e.g. Load2 below) at a program point that
+    // has an unsafe dependence (Store3->Load1) spanning over it must be
+    // included in the same cyclic partition as the dependent operations.  This
+    // is to preserve the original program order after distribution.  E.g.:
+    //
+    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
+    //  Load1 -.              1                          0->1
+    //  Load2  | /Unsafe/     0                          1
+    //  Store3 -'            -1                          1->0
+    //  Load4                 0                          0
+    //
+    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+    // we just keep assigning to the same cyclic partition until
+    // NumUnsafeDependencesActive reaches 0.
+    const MemoryDepChecker &DepChecker = LAI.getDepChecker();
+    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+                                     *InterestingDependences);
+
+    int NumUnsafeDependencesActive = 0;
+    for (auto &InstDep : MID) {
+      Instruction *I = InstDep.Inst;
+      // We update NumUnsafeDependencesActive post-instruction, so catch the
+      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+      if (NumUnsafeDependencesActive ||
+          InstDep.NumUnsafeDependencesStartOrEnd > 0)
+        Partitions.addToCyclicPartition(I);
+      else
+        Partitions.addToNewNonCyclicPartition(I);
+      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+      assert(NumUnsafeDependencesActive >= 0 &&
+             "Negative number of dependences active");
+    }
+
+    // Add partitions for values used outside.  These partitions can be out of
+    // order from the original program order.  This is OK because if the
+    // partition uses a load we will merge this partition with the original
+    // partition of the load that we set up in the previous loop (see
+    // mergeToAvoidDuplicatedLoads).
+    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+    for (auto *Inst : DefsUsedOutside)
+      Partitions.addToNewNonCyclicPartition(Inst);
+
+    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+    if (Partitions.getSize() < 2)
+      return false;
+
+    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+    // should be able to vectorize these together.
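+    // For example (hypothetical), with seeded partitions
+    //   {Store1 (cyclic)} {Store2} {Store3}
+    // the two trailing non-cyclic partitions are merged, giving
+    //   {Store1 (cyclic)} {Store2, Store3}
+    // so that one vectorizable loop is formed instead of two.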
+    Partitions.mergeBeforePopulating();
+    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+    if (Partitions.getSize() < 2)
+      return false;
+
+    // Now, populate the partitions with non-memory operations.
+    Partitions.populateUsedSet();
+    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+    // In order to preserve original lexical order for loads, keep them in the
+    // partition that we set up in the MemoryInstructionDependences loop.
+    if (Partitions.mergeToAvoidDuplicatedLoads()) {
+      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+                   << Partitions);
+      if (Partitions.getSize() < 2)
+        return false;
+    }
+
+    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+    // We're done forming the partitions; set up the reverse mapping from
+    // instructions to partitions.
+    Partitions.setupPartitionIdOnInstructions();
+
+    // To keep things simple, have an empty preheader before we version or
+    // clone the loop.  (Also split if this has no predecessor, i.e. entry,
+    // because we rely on PH having a predecessor.)
+    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+      SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+    // If we need run-time checks to disambiguate the pointers, version the
+    // loop now.
+    RuntimeCheckEmitter RtCheckEmitter(LAI, L, LI, DT);
+    RtCheckEmitter.partitionPointers(Partitions);
+    if (RtCheckEmitter.needsRuntimeChecks()) {
+      RtCheckEmitter.versionLoop(this);
+      RtCheckEmitter.addPHINodes(DefsUsedOutside);
+    }
+
+    // Create identical copies of the original loop for each partition and hook
+    // them up sequentially.
+    Partitions.cloneLoops(this);
+
+    // Now, we remove the instructions from each loop that don't belong to that
+    // partition.
+    Partitions.removeUnusedInsts();
+    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+    DEBUG(Partitions.printBlocks());
+
+    if (LDistVerify) {
+      LI->verify();
+      DT->verifyDomTree();
+    }
+
+    ++NumLoopsDistributed;
+    return true;
+  }
+
+  // Analyses used.
+  LoopInfo *LI;
+  LoopAccessAnalysis *LAA;
+  DominatorTree *DT;
+};
+} // anonymous namespace
+
+char LoopDistribute::ID;
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a12f5a7..f92ecd4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -47,6 +47,7 @@
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
@@ -56,7 +57,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
@@ -130,7 +130,6 @@ namespace {
   class LoopIdiomRecognize : public LoopPass {
     Loop *CurLoop;
-    const DataLayout *DL;
     DominatorTree *DT;
     ScalarEvolution *SE;
     TargetLibraryInfo *TLI;
@@ -139,7 +138,10 @@ namespace {
     static char ID;
     explicit LoopIdiomRecognize() : LoopPass(ID) {
       initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
-      DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr;
+      DT = nullptr;
+      SE = nullptr;
+      TLI = nullptr;
+      TTI = nullptr;
     }
 
     bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -163,8 +165,8 @@ namespace {
     /// loop preheaders be inserted into the CFG.
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
@@ -175,16 +177,8 @@ namespace {
       AU.addPreserved<ScalarEvolution>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<TargetLibraryInfo>();
-      AU.addRequired<TargetTransformInfo>();
-    }
-
-    const DataLayout *getDataLayout() {
-      if (DL)
-        return DL;
-      DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-      DL = DLP ? &DLP->getDataLayout() : nullptr;
-      return DL;
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
     DominatorTree *getDominatorTree() {
@@ -197,11 +191,16 @@ namespace {
     }
 
     TargetLibraryInfo *getTargetLibraryInfo() {
-      return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+      if (!TLI)
+        TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+      return TLI;
     }
 
     const TargetTransformInfo *getTargetTransformInfo() {
-      return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
+      return TTI ? TTI
+                 : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                       *CurLoop->getHeader()->getParent()));
     }
 
     Loop *getLoop() const { return CurLoop; }
@@ -215,14 +214,14 @@ namespace {
 char LoopIdiomRecognize::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                     false, false)
 
@@ -232,44 +231,13 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
 /// and zero out all the operands of this instruction.  If any of them become
 /// dead, delete them and the computation tree that feeds them.
 ///
-static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+static void deleteDeadInstruction(Instruction *I,
                                   const TargetLibraryInfo *TLI) {
-  SmallVector<Instruction*, 32> NowDeadInsts;
-
-  NowDeadInsts.push_back(I);
-
-  // Before we touch this instruction, remove it from SE!
-  do {
-    Instruction *DeadInst = NowDeadInsts.pop_back_val();
-
-    // This instruction is dead, zap it, in stages.  Start by removing it from
-    // SCEV.
-    SE.forgetValue(DeadInst);
-
-    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
-      Value *Op = DeadInst->getOperand(op);
-      DeadInst->setOperand(op, nullptr);
-
-      // If this operand just became dead, add it to the NowDeadInsts list.
-      if (!Op->use_empty()) continue;
-
-      if (Instruction *OpI = dyn_cast<Instruction>(Op))
-        if (isInstructionTriviallyDead(OpI, TLI))
-          NowDeadInsts.push_back(OpI);
-    }
-
-    DeadInst->eraseFromParent();
-
-  } while (!NowDeadInsts.empty());
-}
-
-/// deleteIfDeadInstruction - If the specified value is a dead instruction,
-/// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
-                                    const TargetLibraryInfo *TLI) {
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    if (isInstructionTriviallyDead(I, TLI))
-      deleteDeadInstruction(I, SE, TLI);
+  SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end());
+  I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  I->eraseFromParent();
+  for (Value *Op : Operands)
+    RecursivelyDeleteTriviallyDeadInstructions(Op, TLI);
 }
 
 //===----------------------------------------------------------------------===//
@@ -285,7 +253,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
 // the concern of breaking data dependence.
 bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
   if (BranchInst *Br = getBranch(BB)) {
-    return Br->isUnconditional() && BB->size() == 1;
+    return Br->isUnconditional() && Br == BB->begin();
   }
   return false;
 }
@@ -542,7 +510,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
     cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
   PreCond->replaceAllUsesWith(NewPreCond);
-  deleteDeadInstruction(PreCond, *SE, TLI);
+  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
 }
 
 // Step 3: Note that the population count is exactly the trip count of the
@@ -592,15 +560,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
   // Step 4: All the references to the original population counter outside
   //  the loop are replaced with the NewCount -- the value returned from
   //  __builtin_ctpop().
-  {
-    SmallVector<Value *, 4> CntUses;
-    for (User *U : CntInst->users())
-      if (cast<Instruction>(U)->getParent() != Body)
-        CntUses.push_back(U);
-    for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
-      (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
-    }
-  }
+  CntInst->replaceUsesOutsideBlock(NewCount, Body);
 
   // Step 5: Forget the "non-computable" trip-count SCEV associated with the
   //   loop. The loop would otherwise not be deleted even if it becomes empty.
@@ -651,7 +611,9 @@ bool NclPopcountRecognize::recognize() {
 
 bool LoopIdiomRecognize::runOnCountableLoop() {
   const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
-  if (isa<SCEVCouldNotCompute>(BECount)) return false;
+  assert(!isa<SCEVCouldNotCompute>(BECount) &&
+         "runOnCountableLoop() called on a loop without a predictable"
+         " backedge-taken count");
 
   // If this loop executes exactly one time, then it should be peeled, not
   // optimized by this pass.
@@ -659,15 +621,11 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
     if (BECst->getValue()->getValue() == 0)
       return false;
 
-  // We require target data for now.
-  if (!getDataLayout())
-    return false;
-
   // set DT
   (void)getDominatorTree();
 
-  LoopInfo &LI = getAnalysis<LoopInfo>();
-  TLI = &getAnalysis<TargetLibraryInfo>();
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   // set TLI
   (void)getTargetLibraryInfo();
@@ -681,13 +639,12 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
   bool MadeChange = false;
 
   // Scan all the blocks in the loop that are not in subloops.
-  for (Loop::block_iterator BI = CurLoop->block_begin(),
-         E = CurLoop->block_end(); BI != E; ++BI) {
+  for (auto *BB : CurLoop->getBlocks()) {
     // Ignore blocks in subloops.
-    if (LI.getLoopFor(*BI) != CurLoop)
+    if (LI.getLoopFor(BB) != CurLoop)
       continue;
 
-    MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks);
+    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
   }
   return MadeChange;
 }
@@ -776,7 +733,8 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
   Value *StorePtr = SI->getPointerOperand();
 
   // Reject stores that are so large that they overflow an unsigned.
-  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+  auto &DL = CurLoop->getHeader()->getModule()->getDataLayout();
+  uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType());
   if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
     return false;
 
@@ -951,7 +909,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 // but it can be turned into memset_pattern if the target supports it.
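 // For example (illustrative only), "for (i) p[i] = 0;" can become a plain
 // memset, while a loop storing a repeating 16-byte pattern can become a
 // memset_pattern16 call on targets whose runtime provides it.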
   Value *SplatValue = isBytewiseValue(StoredVal);
   Constant *PatternValue = nullptr;
-
+  auto &DL = CurLoop->getHeader()->getModule()->getDataLayout();
   unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
 
   // If we're allowed to form a memset, and the stored value would be acceptable
@@ -962,9 +920,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
       CurLoop->isLoopInvariant(SplatValue)) {
     // Keep and use SplatValue.
     PatternValue = nullptr;
-  } else if (DestAS == 0 &&
-             TLI->has(LibFunc::memset_pattern16) &&
-             (PatternValue = getMemSetPatternValue(StoredVal, *DL))) {
+  } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) &&
+             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
     // Don't create memset_pattern16s with address spaces.
     // It looks like we can use PatternValue!
     SplatValue = nullptr;
@@ -979,7 +936,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   // header.  This allows us to insert code for it in the preheader.
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
   IRBuilder<> Builder(Preheader->getTerminator());
-  SCEVExpander Expander(*SE, "loop-idiom");
+  SCEVExpander Expander(*SE, DL, "loop-idiom");
 
   Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
 
@@ -997,7 +954,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                             StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(BasePtr, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
     return false;
   }
 
@@ -1039,12 +996,12 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   // Otherwise we should form a memset_pattern16.  PatternValue is known to be
   // a constant array of 16 bytes.  Plop the value into a mergeable global.
   GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
-                                          GlobalValue::InternalLinkage,
+                                          GlobalValue::PrivateLinkage,
                                           PatternValue, ".memset_pattern");
   GV->setUnnamedAddr(true); // Ok to merge these.
   GV->setAlignment(16);
   Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
-  NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
+  NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
 }
 
 DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
@@ -1053,7 +1010,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  deleteDeadInstruction(TheStore, *SE, TLI);
+  deleteDeadInstruction(TheStore, TLI);
   ++NumMemSet;
   return true;
 }
@@ -1076,7 +1033,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
   // header.  This allows us to insert code for it in the preheader.
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
   IRBuilder<> Builder(Preheader->getTerminator());
-  SCEVExpander Expander(*SE, "loop-idiom");
+  const DataLayout &DL = Preheader->getModule()->getDataLayout();
+  SCEVExpander Expander(*SE, DL, "loop-idiom");
 
   // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
   // this into a memcpy in the loop preheader now if we want.  However, this
@@ -1094,7 +1052,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                             getAnalysis<AliasAnalysis>(), SI)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
     return false;
   }
 
@@ -1109,8 +1067,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                             StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
-    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
     return false;
   }
 
@@ -1143,7 +1101,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   // Okay, the memcpy has been formed.  Zap the original store and anything that
   // feeds into it.
-  deleteDeadInstruction(SI, *SE, TLI);
+  deleteDeadInstruction(SI, TLI);
   ++NumMemCpy;
   return true;
 }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index d664f85..e125026 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -23,7 +23,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -44,12 +44,12 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<AssumptionCacheTracker>();
-      AU.addRequired<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addPreservedID(LCSSAID);
       AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
   };
 }
 
@@ -58,9 +58,9 @@ char LoopInstSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
                       "Simplify instructions in loops", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
                     "Simplify instructions in loops", false, false)
 
@@ -76,10 +76,9 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  LoopInfo *LI = &getAnalysis<LoopInfo>();
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
       *L->getHeader()->getParent());
 
@@ -109,6 +108,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
       WorklistItem Item = VisitStack.pop_back_val();
       BasicBlock *BB = Item.getPointer();
       bool IsSubloopHeader = Item.getInt();
+      const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
 
       // Simplify instructions in the current basic block.
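       // For instance (a hypothetical case), an instruction such as
       //   %add = add i32 %x, 0
       // simplifies to %x, and its uses are rewritten by the loop below.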
 for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
new file mode 100644
index 0000000..f584018
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -0,0 +1,1300 @@
+//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass handles the loop interchange transform.
+// It interchanges loops to provide more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+namespace {
+
+typedef SmallVector<Loop *, 8> LoopVector;
+
+// TODO: Check if we can use a sparse matrix here.
+typedef std::vector<std::vector<char>> CharMatrix;
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+struct LoopInterchange;
+
+#ifdef DUMP_DEP_MATRICES
+void printDepMatrix(CharMatrix &DepMatrix) {
+  for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
+    std::vector<char> Vec = *I;
+    for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
+      DEBUG(dbgs() << *II << " ");
+    DEBUG(dbgs() << "\n");
+  }
+}
+#endif
+
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+                                     Loop *L, DependenceAnalysis *DA) {
+  typedef SmallVector<Value *, 16> ValueVector;
+  ValueVector MemInstr;
+
+  if (Level > MaxLoopNestDepth) {
+    DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+                 << MaxLoopNestDepth << "\n");
+    return false;
+  }
+
+  // For each block.
+  for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+       BB != BE; ++BB) {
+    // Scan the BB and collect legal loads and stores.
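+    // "Legal" here means simple accesses only: e.g. (illustrative) a volatile
+    // or atomic load or store makes the whole nest unanalyzable, and the scan
+    // below gives up on it.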
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
+         ++I) {
+      Instruction *Ins = dyn_cast<Instruction>(I);
+      if (!Ins)
+        return false;
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St && !Ld)
+        continue;
+      if (Ld && !Ld->isSimple())
+        return false;
+      if (St && !St->isSimple())
+        return false;
+      MemInstr.push_back(I);
+    }
+  }
+
+  DEBUG(dbgs() << "Found " << MemInstr.size()
+               << " Loads and Stores to analyze\n");
+
+  ValueVector::iterator I, IE, J, JE;
+
+  for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+    for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+      std::vector<char> Dep;
+      Instruction *Src = dyn_cast<Instruction>(*I);
+      Instruction *Des = dyn_cast<Instruction>(*J);
+      if (Src == Des)
+        continue;
+      if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
+        continue;
+      if (auto D = DA->depends(Src, Des, true)) {
+        DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
+                     << "\n");
+        if (D->isFlow()) {
+          // TODO: Handle Flow dependence. Check if it is sufficient to
+          // populate the Dependence Matrix with the direction reversed.
+          DEBUG(dbgs() << "Flow dependence not handled");
+          return false;
+        }
+        if (D->isAnti()) {
+          DEBUG(dbgs() << "Found Anti dependence \n");
+          unsigned Levels = D->getLevels();
+          char Direction;
+          for (unsigned II = 1; II <= Levels; ++II) {
+            const SCEV *Distance = D->getDistance(II);
+            const SCEVConstant *SCEVConst =
+                dyn_cast_or_null<SCEVConstant>(Distance);
+            if (SCEVConst) {
+              const ConstantInt *CI = SCEVConst->getValue();
+              if (CI->isNegative())
+                Direction = '<';
+              else if (CI->isZero())
+                Direction = '=';
+              else
+                Direction = '>';
+              Dep.push_back(Direction);
+            } else if (D->isScalar(II)) {
+              Direction = 'S';
+              Dep.push_back(Direction);
+            } else {
+              unsigned Dir = D->getDirection(II);
+              if (Dir == Dependence::DVEntry::LT ||
+                  Dir == Dependence::DVEntry::LE)
+                Direction = '<';
+              else if (Dir == Dependence::DVEntry::GT ||
+                       Dir == Dependence::DVEntry::GE)
+                Direction = '>';
+              else if (Dir == Dependence::DVEntry::EQ)
+                Direction = '=';
+              else
+                Direction = '*';
+              Dep.push_back(Direction);
+            }
+          }
+          while (Dep.size() != Level) {
+            Dep.push_back('I');
+          }
+
+          DepMatrix.push_back(Dep);
+          if (DepMatrix.size() > MaxMemInstrCount) {
+            DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+                         << " dependencies inside loop\n");
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  // We don't have a DepMatrix to check legality; return false.
+  if (DepMatrix.size() == 0)
+    return false;
+  return true;
+}
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
+                                    unsigned ToIndx) {
+  unsigned numRows = DepMatrix.size();
+  for (unsigned i = 0; i < numRows; ++i) {
+    char TmpVal = DepMatrix[i][ToIndx];
+    DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+    DepMatrix[i][FromIndx] = TmpVal;
+  }
+}
+
+// Checks if the outermost non-'=', 'S' or 'I' dependence in the dependence
+// matrix is '>'.
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+                                   unsigned Column) {
+  for (unsigned i = 0; i <= Column; ++i) {
+    if (DepMatrix[Row][i] == '<')
+      return false;
+    if (DepMatrix[Row][i] == '>')
+      return true;
+  }
+  // All dependences were '=', 'S' or 'I'.
+  return false;
+}
+
+// Checks if no dependence exists in the dependency matrix in Row before
+// Column.
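+// For example (hypothetical), for the direction row "= <" and Column == 1,
+// only the leading '=' is inspected, so the row counts as dependence-free
+// here.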
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+                                 unsigned Column) {
+  for (unsigned i = 0; i < Column; ++i) {
+    if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
+        DepMatrix[Row][i] != 'I')
+      return false;
+  }
+  return true;
+}
+
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+                                unsigned OuterLoopId, char InnerDep,
+                                char OuterDep) {
+
+  if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+    return false;
+
+  if (InnerDep == OuterDep)
+    return true;
+
+  // It is legal to interchange if and only if after interchange no row has a
+  // '>' direction as the leftmost non-'='.
+
+  if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+    return true;
+
+  if (InnerDep == '<')
+    return true;
+
+  if (InnerDep == '>') {
+    // If OuterLoopId represents the outermost loop, then interchanging will
+    // make the first dependence '>'.
+    if (OuterLoopId == 0)
+      return false;
+
+    // If all dependences before OuterLoopId are '=', 'S' or 'I', then
+    // interchanging will result in this row having '>' as its outermost
+    // non-'=' dependence.
+    if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+      return true;
+  }
+
+  return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+                                      unsigned InnerLoopId,
+                                      unsigned OuterLoopId) {
+
+  unsigned NumRows = DepMatrix.size();
+  // For each row check if it is valid to interchange.
+  for (unsigned Row = 0; Row < NumRows; ++Row) {
+    char InnerDep = DepMatrix[Row][InnerLoopId];
+    char OuterDep = DepMatrix[Row][OuterLoopId];
+    if (InnerDep == '*' || OuterDep == '*')
+      return false;
+    else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep,
+                                  OuterDep))
+      return false;
+  }
+  return true;
+}
+
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+
+  DEBUG(dbgs() << "populateWorklist called\n");
+  LoopVector LoopList;
+  Loop *CurrentLoop = &L;
+  std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector();
+  while (vec.size() != 0) {
+    // The current loop has multiple subloops in it, hence it is not tightly
+    // nested. Discard all the loops collected into LoopList so far.
+    if (vec.size() != 1) {
+      LoopList.clear();
+      return;
+    }
+    LoopList.push_back(CurrentLoop);
+    CurrentLoop = *(vec.begin());
+    vec = CurrentLoop->getSubLoopsVector();
+  }
+  LoopList.push_back(CurrentLoop);
+  V.push_back(LoopList);
+}
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+  PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+  if (InnerIndexVar)
+    return InnerIndexVar;
+  if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+    return nullptr;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PhiVar = cast<PHINode>(I);
+    Type *PhiTy = PhiVar->getType();
+    if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+        !PhiTy->isPointerTy())
+      return nullptr;
+    const SCEVAddRecExpr *AddRec =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+    if (!AddRec || !AddRec->isAffine())
+      continue;
+    const SCEV *Step = AddRec->getStepRecurrence(*SE);
+    const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+    if (!C)
+      continue;
+    // Found the induction variable.
+    // FIXME: Handle loops with more than one induction variable.
Note that, + // currently, legality makes sure we have only one induction variable. + return PhiVar; + } + return nullptr; +} + +/// LoopInterchangeLegality checks if it is legal to interchange the loop. +class LoopInterchangeLegality { +public: + LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInterchange *Pass) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass), + InnerLoopHasReduction(false) {} + + /// Check if the loops can be interchanged. + bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix); + /// Check if the loop structure is understood. We do not handle triangular + /// loops for now. + bool isLoopStructureUnderstood(PHINode *InnerInductionVar); + + bool currentLimitations(); + + bool hasInnerLoopReduction() { return InnerLoopHasReduction; } + +private: + bool tightlyNested(Loop *Outer, Loop *Inner); + bool containsUnsafeInstructionsInHeader(BasicBlock *BB); + bool areAllUsesReductions(Instruction *Ins, Loop *L); + bool containsUnsafeInstructionsInLatch(BasicBlock *BB); + bool findInductionAndReductions(Loop *L, + SmallVector<PHINode *, 8> &Inductions, + SmallVector<PHINode *, 8> &Reductions); + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; + LoopInterchange *CurrentPass; + + bool InnerLoopHasReduction; +}; + +/// LoopInterchangeProfitability checks if it is profitable to interchange the +/// loop. +class LoopInterchangeProfitability { +public: + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + + /// Check if the loop interchange is profitable + bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix); + +private: + int getInstrOrderCost(); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. + ScalarEvolution *SE; +}; + +/// LoopInterchangeTransform interchanges the loop +class LoopInterchangeTransform { +public: + LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + LoopInfo *LI, DominatorTree *DT, + LoopInterchange *Pass, BasicBlock *LoopNestExit, + bool InnerLoopContainsReductions) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + LoopExit(LoopNestExit), + InnerLoopHasReduction(InnerLoopContainsReductions) {} + + /// Interchange OuterLoop and InnerLoop. + bool transform(); + void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); + void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); + +private: + void splitInnerLoopLatch(Instruction *); + void splitOuterLoopLatch(); + void splitInnerLoopHeader(); + bool adjustLoopLinks(); + void adjustLoopPreheaders(); + void adjustOuterLoopPreheader(); + void adjustInnerLoopPreheader(); + bool adjustLoopBranches(); + void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, + BasicBlock *NewPred); + + Loop *OuterLoop; + Loop *InnerLoop; + + /// Scev analysis. 
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  DominatorTree *DT;
+  BasicBlock *LoopExit;
+  bool InnerLoopHasReduction;
+};
+
+// Main LoopInterchange Pass
+struct LoopInterchange : public FunctionPass {
+  static char ID;
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  DependenceAnalysis *DA;
+  DominatorTree *DT;
+  LoopInterchange()
+      : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) {
+    initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DependenceAnalysis>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+  }
+
+  bool runOnFunction(Function &F) override {
+    SE = &getAnalysis<ScalarEvolution>();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DA = &getAnalysis<DependenceAnalysis>();
+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    // Build up a worklist of loop pairs to analyze.
+    SmallVector<LoopVector, 8> Worklist;
+
+    for (Loop *L : *LI)
+      populateWorklist(*L, Worklist);
+
+    DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+    bool Changed = false;
+    while (!Worklist.empty()) {
+      LoopVector LoopList = Worklist.pop_back_val();
+      Changed |= processLoopList(LoopList, F);
+    }
+    return Changed;
+  }
+
+  bool isComputableLoopNest(LoopVector LoopList) {
+    for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
+      Loop *L = *I;
+      const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+      if (ExitCountOuter == SE->getCouldNotCompute()) {
+        DEBUG(dbgs() << "Couldn't compute backedge count\n");
+        return false;
+      }
+      if (L->getNumBackEdges() != 1) {
+        DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+        return false;
+      }
+      if (!L->getExitingBlock()) {
+        DEBUG(dbgs() << "Loop doesn't have a unique exit block\n");
+        return false;
+      }
+    }
+    return true;
+  }
+
+  unsigned selectLoopForInterchange(LoopVector LoopList) {
+    // TODO: Add a better heuristic to select the loop to be interchanged based
+    // on the dependence matrix. Currently we select the innermost loop.
+    return LoopList.size() - 1;
+  }
+
+  bool processLoopList(LoopVector LoopList, Function &F) {
+
+    bool Changed = false;
+    CharMatrix DependencyMatrix;
+    if (LoopList.size() < 2) {
+      DEBUG(dbgs() << "Loop doesn't contain the minimum nesting level.\n");
+      return false;
+    }
+    if (!isComputableLoopNest(LoopList)) {
+      DEBUG(dbgs() << "Not a valid loop candidate for interchange\n");
+      return false;
+    }
+    Loop *OuterMostLoop = *(LoopList.begin());
+
+    DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size()
+                 << "\n");
+
+    if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
+                                  OuterMostLoop, DA)) {
+      DEBUG(dbgs() << "Populating dependency matrix failed\n");
+      return false;
+    }
+#ifdef DUMP_DEP_MATRICES
+    DEBUG(dbgs() << "Dependence before interchange\n");
+    printDepMatrix(DependencyMatrix);
+#endif
+
+    BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+    BranchInst *OuterMostLoopLatchBI =
+        dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+    if (!OuterMostLoopLatchBI)
+      return false;
+
+    // Since we currently do not handle LCSSA PHIs, any failure in the loop
+    // condition will now branch to LoopNestExit.
+    // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+    // Get the outermost loop exit.
+    BasicBlock *LoopNestExit;
+    if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+    else
+      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+    if (isa<PHINode>(LoopNestExit->begin())) {
+      DEBUG(dbgs() << "PHI nodes in loop nest exit are not handled for now "
+                      "since on failure all loops branch to loop nest exit.\n");
+      return false;
+    }
+
+    unsigned SelectedLoopId = selectLoopForInterchange(LoopList);
+    // Move the selected loop outwards to the best possible position.
+    for (unsigned i = SelectedLoopId; i > 0; i--) {
+      bool Interchanged =
+          processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+      if (!Interchanged)
+        return Changed;
+      // Loops interchanged, reflect the same in LoopList.
+      std::swap(LoopList[i - 1], LoopList[i]);
+
+      // Update the DependencyMatrix.
+      interChangeDependencies(DependencyMatrix, i, i - 1);
+      DT->recalculate(F);
+#ifdef DUMP_DEP_MATRICES
+      DEBUG(dbgs() << "Dependence after interchange\n");
+      printDepMatrix(DependencyMatrix);
+#endif
+      Changed |= Interchanged;
+    }
+    return Changed;
+  }
+
+  bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+                   unsigned OuterLoopId, BasicBlock *LoopNestExit,
+                   std::vector<std::vector<char>> &DependencyMatrix) {
+
+    DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+                 << " and OuterLoopId = " << OuterLoopId << "\n");
+    Loop *InnerLoop = LoopList[InnerLoopId];
+    Loop *OuterLoop = LoopList[OuterLoopId];
+
+    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this);
+    if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+      DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality\n");
+      return false;
+    }
+    DEBUG(dbgs() << "Loops are legal to interchange\n");
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+      DEBUG(dbgs() << "Interchanging loops not profitable\n");
+      return false;
+    }
+
+    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this,
+                                 LoopNestExit, LIL.hasInnerLoopReduction());
+    LIT.transform();
+    DEBUG(dbgs() << "Loops interchanged\n");
+    return true;
+  }
+};
+
+} // end of namespace
+bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
+  return !std::any_of(Ins->user_begin(), Ins->user_end(), [=](User *U) -> bool {
+    PHINode *UserIns = dyn_cast<PHINode>(U);
+    ReductionDescriptor RD;
+    return !UserIns || !ReductionDescriptor::isReductionPHI(UserIns, L, RD);
+  });
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
+    BasicBlock *BB) {
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // Loads corresponding to reduction PHIs are safe while concluding if
+    // tightly nested.
+    if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      if (!areAllUsesReductions(L, InnerLoop))
+        return true;
+    } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      return true;
+  }
+  return false;
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
+    BasicBlock *BB) {
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // Stores corresponding to reductions are safe while concluding if tightly
+    // nested.
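+    // e.g. (an illustrative sketch) for "sum += a[i][j]" the latch may store
+    // the reduction PHI's value back to memory; such a store does not break
+    // tight nesting, while any other memory access or side effect does.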
+    if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+      PHINode *PHI = dyn_cast<PHINode>(L->getOperand(0));
+      if (!PHI)
+        return true;
+    } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      return true;
+  }
+  return false;
+}
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+  DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n");
+
+  // A perfectly nested loop will not have any branch in between the outer and
+  // inner block, i.e. the outer header will branch only to either the inner
+  // preheader or the outer loop latch.
+  BranchInst *outerLoopHeaderBI =
+      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+  if (!outerLoopHeaderBI)
+    return false;
+  unsigned num = outerLoopHeaderBI->getNumSuccessors();
+  for (unsigned i = 0; i < num; i++) {
+    if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
+        outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+      return false;
+  }
+
+  DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+  // We do not have any basic block in between; now make sure the outer header
+  // and the outer loop latch don't contain any unsafe instructions.
+  if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
+      containsUnsafeInstructionsInLatch(OuterLoopLatch))
+    return false;
+
+  DEBUG(dbgs() << "Loops are perfectly nested\n");
+  // We have a perfect loop nest.
+  return true;
+}
+
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+    PHINode *InnerInduction) {
+
+  unsigned Num = InnerInduction->getNumOperands();
+  BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+  for (unsigned i = 0; i < Num; ++i) {
+    Value *Val = InnerInduction->getOperand(i);
+    if (isa<Constant>(Val))
+      continue;
+    Instruction *I = dyn_cast<Instruction>(Val);
+    if (!I)
+      return false;
+    // TODO: Handle triangular loops.
+    // e.g. for(int i=0;i<N;i++)
+    //        for(int j=i;j<N;j++)
+    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+            InnerLoopPreheader &&
+        !OuterLoop->isLoopInvariant(I)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+    Loop *L, SmallVector<PHINode *, 8> &Inductions,
+    SmallVector<PHINode *, 8> &Reductions) {
+  if (!L->getLoopLatch() || !L->getLoopPredecessor())
+    return false;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    ReductionDescriptor RD;
+    PHINode *PHI = cast<PHINode>(I);
+    ConstantInt *StepValue = nullptr;
+    if (isInductionPHI(PHI, SE, StepValue))
+      Inductions.push_back(PHI);
+    else if (ReductionDescriptor::isReductionPHI(PHI, L, RD))
+      Reductions.push_back(PHI);
+    else {
+      DEBUG(
+          dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
+  for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PHI = cast<PHINode>(I);
+    // A reduction LCSSA PHI will have only one incoming block, the loop latch.
+    if (PHI->getNumIncomingValues() > 1)
+      return false;
+    Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+    if (!Ins)
+      return false;
+    // The incoming value of an LCSSA PHI in the outer loop exit can only be
+    // the inner loop exit's LCSSA PHI; otherwise the nest would not be tightly
+    // nested.
+    if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
+      return false;
+  }
+  return true;
+}
+
+static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
+                                         BasicBlock *LoopHeader) {
+  if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
+    unsigned Num = BI->getNumSuccessors();
+    assert(Num == 2);
+    for (unsigned i = 0; i < Num; ++i) {
+      if (BI->getSuccessor(i) == LoopHeader)
+        continue;
+      return BI->getSuccessor(i);
+    }
+  }
+  return nullptr;
+}
+
+// This function indicates the current limitations in the transform, as a
+// result of which we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+  PHINode *InnerInductionVar;
+  SmallVector<PHINode *, 8> Inductions;
+  SmallVector<PHINode *, 8> Reductions;
+  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions))
+    return true;
+
+  // TODO: Currently we handle only loops with 1 induction variable.
+  if (Inductions.size() != 1) {
+    DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
+                 << " Failed to interchange due to current limitation\n");
+    return true;
+  }
+  if (Reductions.size() > 0)
+    InnerLoopHasReduction = true;
+
+  InnerInductionVar = Inductions.pop_back_val();
+  Reductions.clear();
+  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions))
+    return true;
+
+  // The outer loop cannot have a reduction because then the loops will not be
+  // tightly nested.
+  if (!Reductions.empty())
+    return true;
+  // TODO: Currently we handle only loops with 1 induction variable.
+  if (Inductions.size() != 1)
+    return true;
+
+  // TODO: Triangular loops are not handled for now.
+  if (!isLoopStructureUnderstood(InnerInductionVar)) {
+    DEBUG(dbgs() << "Loop structure not understood by pass\n");
+    return true;
+  }
+
+  // TODO: We only handle LCSSA PHIs corresponding to reductions for now.
+  BasicBlock *LoopExitBlock =
+      getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
+  if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true))
+    return true;
+
+  LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
+  if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false))
+    return true;
+
+  // TODO: Current limitation: since we split the inner loop latch at the point
+  // where the induction variable is incremented (induction.next), we cannot
+  // have more than one user of induction.next, since that would result in
+  // broken code after the split.
+  // e.g.
+  // for(i=0;i<N;i++) {
+  //   for(j = 0;j<M;j++) {
+  //     A[j+1][i+2] = A[j][i]+k;
+  //   }
+  // }
+  bool FoundInduction = false;
+  Instruction *InnerIndexVarInc = nullptr;
+  if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+    InnerIndexVarInc =
+        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+  else
+    InnerIndexVarInc =
+        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+  if (!InnerIndexVarInc)
+    return true;
+
+  // Since we split the inner loop latch on this induction variable, make sure
+  // there is no instruction between the induction variable and the branch
+  // instruction.
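+  // e.g. (an illustrative latch shape that passes this check):
+  //   %j.next = add nsw i32 %j, 1
+  //   %cmp = icmp slt i32 %j.next, %m
+  //   br i1 %cmp, label %for.j, label %for.j.exit
+  // Only compares, truncs, the branch and the increment may appear here.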
+
+  for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend();
+       I != E && !FoundInduction; ++I) {
+    if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I))
+      continue;
+    const Instruction &Ins = *I;
+    // We found an instruction. If this is not the induction variable then it
+    // is not safe to split this loop latch.
+    if (!Ins.isIdenticalTo(InnerIndexVarInc))
+      return true;
+    else
+      FoundInduction = true;
+  }
+  // The loop latch ended and we didn't find the induction variable; return as
+  // a current limitation.
+  if (!FoundInduction)
+    return true;
+
+  return false;
+}
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+                                                  unsigned OuterLoopId,
+                                                  CharMatrix &DepMatrix) {
+
+  if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+    DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+                 << " and OuterLoopId = " << OuterLoopId
+                 << " due to dependence\n");
+    return false;
+  }
+
+  // Create unique preheaders if we do not already have one.
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+  // Create a unique outer preheader -
+  // 1) If OuterLoop preheader is not present.
+  // 2) If OuterLoop Preheader is the same as OuterLoop Header.
+  // 3) If OuterLoop Preheader is the same as the Header of the previous loop.
+  // 4) If OuterLoop Preheader is the Entry node.
+  if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+      isa<PHINode>(OuterLoopPreHeader->begin()) ||
+      !OuterLoopPreHeader->getUniquePredecessor()) {
+    OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass);
+  }
+
+  if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+      InnerLoopPreHeader == OuterLoop->getHeader()) {
+    InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass);
+  }
+
+  // TODO: The loops could not be interchanged due to current limitations in
+  // the transform module.
+  if (currentLimitations()) {
+    DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+    return false;
+  }
+
+  // Check if the loops are tightly nested.
+  if (!tightlyNested(OuterLoop, InnerLoop)) {
+    DEBUG(dbgs() << "Loops not tightly nested\n");
+    return false;
+  }
+
+  return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+  unsigned GoodOrder, BadOrder;
+  BadOrder = GoodOrder = 0;
+  for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
+       BI != BE; ++BI) {
+    for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) {
+      const Instruction &Ins = *I;
+      if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+        unsigned NumOp = GEP->getNumOperands();
+        bool FoundInnerInduction = false;
+        bool FoundOuterInduction = false;
+        for (unsigned i = 0; i < NumOp; ++i) {
+          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+          if (!AR)
+            continue;
+
+          // If we find the inner induction after an outer induction e.g.
+          // for(int i=0;i<N;i++)
+          //   for(int j=0;j<N;j++)
+          //     A[i][j] = A[i-1][j-1]+k;
+          // then it is a good order.
+          if (AR->getLoop() == InnerLoop) {
+            // We found an InnerLoop induction after OuterLoop induction. It is
+            // a good order.
+            FoundInnerInduction = true;
+            if (FoundOuterInduction) {
+              GoodOrder++;
+              break;
+            }
+          }
+          // If we find the outer induction after an inner induction e.g.
+          // for(int i=0;i<N;i++)
+          //   for(int j=0;j<N;j++)
+          //     A[j][i] = A[j-1][i-1]+k;
+          // then it is a bad order.
+          if (AR->getLoop() == OuterLoop) {
+            // We found an OuterLoop induction after InnerLoop induction. It is
+            // a bad order.
+            FoundOuterInduction = true;
+            if (FoundInnerInduction) {
+              BadOrder++;
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+  return GoodOrder - BadOrder;
+}
+
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+                                         unsigned OuterLoopId,
+                                         CharMatrix &DepMatrix) {
+  // TODO: Improve this heuristic to catch more cases.
+  // If the inner loop is loop independent or doesn't carry any dependence it
+  // is profitable to move it to the outer position.
+  unsigned Row = DepMatrix.size();
+  for (unsigned i = 0; i < Row; ++i) {
+    if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
+      return false;
+    // TODO: We need to improve this heuristic.
+    if (DepMatrix[i][OuterLoopId] != '=')
+      return false;
+  }
+  // If the outer loop has a dependence and the inner loop is loop independent
+  // then it is profitable to interchange to enable parallelism.
+  return true;
+}
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+                                                unsigned OuterLoopId,
+                                                CharMatrix &DepMatrix) {
+
+  // TODO: Add better profitability checks.
+  // e.g.
+  // 1) Construct a dependency matrix and move the one with no loop-carried
+  //    dependence inside to enable vectorization.
+
+  // This is a rough cost estimation algorithm. It counts the good and bad
+  // orders of induction variables in the instruction and allows reordering if
+  // the number of bad orders is more than good ones.
+  int Cost = 0;
+  Cost += getInstrOrderCost();
+  DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (Cost < 0)
+    return true;
+
+  // It is not profitable as per the current cache profitability model. But
+  // check if we can move this loop outside to improve parallelism.
+  bool ImprovesPar =
+      isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+  return ImprovesPar;
+}
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+                                               Loop *InnerLoop) {
+  for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
+       ++I) {
+    if (*I == InnerLoop) {
+      OuterLoop->removeChildLoop(I);
+      return;
+    }
+  }
+  assert(false && "Couldn't find loop");
+}
+
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+                                                Loop *OuterLoop) {
+  Loop *OuterLoopParent = OuterLoop->getParentLoop();
+  if (OuterLoopParent) {
+    // Remove the loop from its parent loop.
+    removeChildLoop(OuterLoopParent, OuterLoop);
+    removeChildLoop(OuterLoop, InnerLoop);
+    OuterLoopParent->addChildLoop(InnerLoop);
+  } else {
+    removeChildLoop(OuterLoop, InnerLoop);
+    LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+  }
+
+  while (!InnerLoop->empty())
+    OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
+
+  InnerLoop->addChildLoop(OuterLoop);
+}
+
+bool LoopInterchangeTransform::transform() {
+
+  DEBUG(dbgs() << "transform\n");
+  bool Transformed = false;
+  Instruction *InnerIndexVar;
+
+  if (InnerLoop->getSubLoops().size() == 0) {
+    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+    DEBUG(dbgs() << "Calling Split Inner Loop\n");
+    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+    if (!InductionPHI) {
+      DEBUG(dbgs() << "Failed to find the point to split loop latch\n");
+      return false;
+    }
+
+    if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+    else
+      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+    //
+    // Split at the place where the induction variable is
+    // incremented/decremented.
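+    // For example (sketch), in a latch of the form
+    //   %j.next = add i32 %j, 1
+    //   %cmp = icmp slt i32 %j.next, %m
+    //   br i1 %cmp, ...
+    // the block is split so that the increment, compare and branch start the
+    // new block.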
+    // TODO: This splitting logic may not always work. Fix this.
+    splitInnerLoopLatch(InnerIndexVar);
+    DEBUG(dbgs() << "splitInnerLoopLatch Done\n");
+
+    // Split the inner loop's PHI nodes out into a separate basic block.
+    splitInnerLoopHeader();
+    DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
+  }
+
+  Transformed |= adjustLoopLinks();
+  if (!Transformed) {
+    DEBUG(dbgs() << "adjustLoopLinks Failed\n");
+    return false;
+  }
+
+  restructureLoops(InnerLoop, OuterLoop);
+  return true;
+}
+
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
+  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+  BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
+  InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
+}
+
+void LoopInterchangeTransform::splitOuterLoopLatch() {
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+  BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch;
+  OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock,
+                              OuterLoopLatch->getFirstNonPHI(), DT, LI);
+}
+
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+
+  // Split the inner loop header out. Here make sure that the reduction PHIs
+  // stay in the inner loop body.
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  if (InnerLoopHasReduction) {
+    // FIXME: Check if the induction PHI will always be the first PHI.
+    BasicBlock *New = InnerLoopHeader->splitBasicBlock(
+        ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
+    if (LI)
+      if (Loop *L = LI->getLoopFor(InnerLoopHeader))
+        L->addBasicBlockToLoop(New, *LI);
+
+    // Adjust the reduction PHIs in the block.
+    SmallVector<PHINode *, 8> PHIVec;
+    for (auto I = New->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PHI = dyn_cast<PHINode>(I);
+      Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
+      PHI->replaceAllUsesWith(V);
+      PHIVec.push_back(PHI);
+    }
+    for (auto I = PHIVec.begin(), E = PHIVec.end(); I != E; ++I) {
+      PHINode *P = *I;
+      P->eraseFromParent();
+    }
+  } else {
+    SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+  }
+
+  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+                  "InnerLoopHeader\n");
+}
+
+/// \brief Move all instructions except the terminator from FromBB right before
+/// InsertBefore.
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+  auto &ToList = InsertBefore->getParent()->getInstList();
+  auto &FromList = FromBB->getInstList();
+
+  ToList.splice(InsertBefore, FromList, FromList.begin(),
+                FromBB->getTerminator());
+}
+
+void LoopInterchangeTransform::adjustOuterLoopPreheader() {
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
+
+  moveBBContents(OuterLoopPreHeader, InnerPreHeader->getTerminator());
+}
+
+void LoopInterchangeTransform::adjustInnerLoopPreheader() {
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterHeader = OuterLoop->getHeader();
+
+  moveBBContents(InnerLoopPreHeader, OuterHeader->getTerminator());
+}
+
+void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
+                                                   BasicBlock *OldPred,
+                                                   BasicBlock *NewPred) {
+  for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PHI = cast<PHINode>(I);
+    unsigned Num = PHI->getNumIncomingValues();
+    for (unsigned i = 0; i < Num; ++i) {
+      if (PHI->getIncomingBlock(i) == OldPred)
+        PHI->setIncomingBlock(i, NewPred);
+    }
+  }
+}
+
+bool
+
+ DEBUG(dbgs() << "adjustLoopBranches called\n");
+ // Adjust the loop preheader.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+ // Adjust loop preheaders and headers.
+
+ unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+ OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+ }
+
+ NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+ OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+ else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+ OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+ }
+
+ // Adjust reduction PHIs now that the incoming block has changed.
+ updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
+ OuterLoopHeader);
+
+ BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+ InnerLoopHeaderBI->eraseFromParent();
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+ InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+ }
+
+ // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of each PHI
+ // with its incoming value and remove the PHI node from the inner loop.
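+ // [Editorial illustration, not part of this patch] Such an LCSSA PHI has
+ // a single incoming value, e.g.
+ //   %p = phi i32 [ %v, %inner.latch ]
+ // so every use of %p can be rewritten to %v before the PHI is erased.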
+ SmallVector<PHINode *, 8> LcssaVec;
+ for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
+ PHINode *LcssaPhi = cast<PHINode>(I);
+ LcssaVec.push_back(LcssaPhi);
+ }
+ for (auto I = LcssaVec.begin(), E = LcssaVec.end(); I != E; ++I) {
+ PHINode *P = *I;
+ Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
+ P->replaceAllUsesWith(Incoming);
+ P->eraseFromParent();
+ }
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+ InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+ else
+ InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+ updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+ OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+ } else {
+ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ }
+
+ return true;
+}
+
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheader as well.
+ // This is because the content of the inner preheader was previously executed
+ // inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BranchInst *InnerTermBI =
+ cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+ // These instructions should now be executed inside the loop.
+ // Move the instructions into a new block after the outer header.
+ moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
+ // These instructions were not executed previously in the loop, so move them
+ // to the old inner loop preheader.
+ moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+
+ // Adjust all branches in the inner and outer loop.
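+ // [Editorial illustration, not part of this patch] The net effect being
+ // wired up here is the classic interchange:
+ //   for (int i = 0; i < N; ++i)          for (int j = 0; j < M; ++j)
+ //     for (int j = 0; j < M; ++j)   ->     for (int i = 0; i < N; ++i)
+ //       A[j][i] += ...;                      A[j][i] += ...;
+ // so that the innermost subscript varies with the innermost loop,
+ // improving cache reuse.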
+ bool Changed = adjustLoopBranches(); + if (Changed) + adjustLoopPreheaders(); + return Changed; +} + +char LoopInterchange::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) + +INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", + "Interchanges loops for cache reuse", false, false) + +Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 8f12204..ed103e6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -21,6 +23,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -28,7 +31,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -43,6 +45,12 @@ static cl::opt<unsigned> MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, cl::desc("The maximum increment for loop rerolling")); +static cl::opt<unsigned> +NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), + cl::Hidden, + cl::desc("The maximum number of failures to tolerate" + " during fuzzy matching. (default: 400)")); + // This loop re-rolling transformation aims to transform loops like this: // // int foo(int a); @@ -119,6 +127,16 @@ MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, // br %cmp, header, exit namespace { + enum IterationLimits { + /// The maximum number of iterations that we'll try and reroll. This + /// has to be less than 25 in order to fit into a SmallBitVector. + IL_MaxRerollIterations = 16, + /// The bitvector index used by loop induction variables and other + /// instructions that belong to all iterations. 
+ IL_All, + IL_End + }; + class LoopReroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid @@ -130,19 +148,18 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AliasAnalysis>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } -protected: + protected: AliasAnalysis *AA; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; TargetLibraryInfo *TLI; DominatorTree *DT; @@ -311,26 +328,113 @@ protected: DenseSet<int> Reds; }; + // A DAGRootSet models an induction variable being used in a rerollable + // loop. For example, + // + // x[i*3+0] = y1 + // x[i*3+1] = y2 + // x[i*3+2] = y3 + // + // Base instruction -> i*3 + // +---+----+ + // / | \ + // ST[y1] +1 +2 <-- Roots + // | | + // ST[y2] ST[y3] + // + // There may be multiple DAGRoots, for example: + // + // x[i*2+0] = ... (1) + // x[i*2+1] = ... (1) + // x[i*2+4] = ... (2) + // x[i*2+5] = ... (2) + // x[(i+1234)*2+5678] = ... (3) + // x[(i+1234)*2+5679] = ... (3) + // + // The loop will be rerolled by adding a new loop induction variable, + // one for the Base instruction in each DAGRootSet. + // + struct DAGRootSet { + Instruction *BaseInst; + SmallInstructionVector Roots; + // The instructions between IV and BaseInst (but not including BaseInst). + SmallInstructionSet SubsumedInsts; + }; + + // The set of all DAG roots, and state tracking of all roots + // for a particular induction variable. + struct DAGRootTracker { + DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, + ScalarEvolution *SE, AliasAnalysis *AA, + TargetLibraryInfo *TLI) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} + + /// Stage 1: Find all the DAG roots for the induction variable. + bool findRoots(); + /// Stage 2: Validate if the found roots are valid. + bool validate(ReductionTracker &Reductions); + /// Stage 3: Assuming validate() returned true, perform the + /// replacement. + /// @param IterCount The maximum iteration count of L. + void replace(const SCEV *IterCount); + + protected: + typedef MapVector<Instruction*, SmallBitVector> UsesTy; + + bool findRootsRecursive(Instruction *IVU, + SmallInstructionSet SubsumedInsts); + bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts); + bool collectPossibleRoots(Instruction *Base, + std::map<int64_t,Instruction*> &Roots); + + bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet); + void collectInLoopUserSet(const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Instruction *Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + + UsesTy::iterator nextInstr(int Val, UsesTy &In, + const SmallInstructionSet &Exclude, + UsesTy::iterator *StartI=nullptr); + bool isBaseInst(Instruction *I); + bool isRootInst(Instruction *I); + bool instrDependsOn(Instruction *I, + UsesTy::iterator Start, + UsesTy::iterator End); + + LoopReroll *Parent; + + // Members of Parent, replicated here for brevity. + Loop *L; + ScalarEvolution *SE; + AliasAnalysis *AA; + TargetLibraryInfo *TLI; + + // The loop induction variable. 
+ Instruction *IV; + // Loop step amount. + uint64_t Inc; + // Loop reroll count; if Inc == 1, this records the scaling applied + // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; + // If Inc is not 1, Scale = Inc. + uint64_t Scale; + // The roots themselves. + SmallVector<DAGRootSet,16> RootSets; + // All increment instructions for IV. + SmallInstructionVector LoopIncs; + // Map of all instructions in the loop (in order) to the iterations + // they are used in (or specially, IL_All for instructions + // used in the loop increment mechanism). + UsesTy Uses; + }; + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); void collectPossibleReductions(Loop *L, ReductionTracker &Reductions); - void collectInLoopUserSet(Loop *L, - const SmallInstructionVector &Roots, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - void collectInLoopUserSet(Loop *L, - Instruction * Root, - const SmallInstructionSet &Exclude, - const SmallInstructionSet &Final, - DenseSet<Instruction *> &Users); - bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, - Instruction *&IV, - SmallInstructionVector &LoopIncs); - bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, - SmallVector<SmallInstructionVector, 32> &Roots, - SmallInstructionSet &AllRoots, - SmallInstructionVector &LoopIncs); bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions); }; @@ -339,10 +443,10 @@ protected: char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) Pass *llvm::createLoopRerollPass() { @@ -353,10 +457,10 @@ Pass *llvm::createLoopRerollPass() { // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in // non-loop blocks to be outside the loop. static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { - for (User *U : I->users()) + for (User *U : I->users()) { if (!L->contains(cast<Instruction>(U))) return true; - + } return false; } @@ -403,6 +507,8 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { // (including the PHI), except for the last value (which is used by the PHI // and also outside the loop). Instruction *C = Instructions.front(); + if (C->user_empty()) + return; do { C = cast<Instruction>(*C->user_begin()); @@ -424,11 +530,12 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) { return; // C is now the (potential) last instruction in the reduction chain. - for (User *U : C->users()) + for (User *U : C->users()) { // The only in-loop user can be the initial PHI. if (L->contains(cast<Instruction>(U))) if (cast<Instruction>(U) != Instructions.front()) return; + } Instructions.push_back(C); Valid = true; @@ -467,7 +574,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, // if they are users, but their users are not added. This is used, for // example, to prevent a reduction update from forcing all later reduction // updates into the use set. 
-void LoopReroll::collectInLoopUserSet(Loop *L, +void LoopReroll::DAGRootTracker::collectInLoopUserSet( Instruction *Root, const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { @@ -504,14 +611,14 @@ void LoopReroll::collectInLoopUserSet(Loop *L, // Collect all of the users of all of the provided root instructions (combined // into a single set). -void LoopReroll::collectInLoopUserSet(Loop *L, +void LoopReroll::DAGRootTracker::collectInLoopUserSet( const SmallInstructionVector &Roots, const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { for (SmallInstructionVector::const_iterator I = Roots.begin(), IE = Roots.end(); I != IE; ++I) - collectInLoopUserSet(L, *I, Exclude, Final, Users); + collectInLoopUserSet(*I, Exclude, Final, Users); } static bool isSimpleLoadStore(Instruction *I) { @@ -524,130 +631,699 @@ static bool isSimpleLoadStore(Instruction *I) { return false; } -// Recognize loops that are setup like this: -// -// %iv = phi [ (preheader, ...), (body, %iv.next) ] -// %scaled.iv = mul %iv, scale -// f(%scaled.iv) -// %scaled.iv.1 = add %scaled.iv, 1 -// f(%scaled.iv.1) -// %scaled.iv.2 = add %scaled.iv, 2 -// f(%scaled.iv.2) -// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 -// f(%scaled.iv.scale_m_1) -// ... -// %iv.next = add %iv, 1 -// %cmp = icmp(%iv, ...) -// br %cmp, header, exit -// -// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. -bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, - Instruction *&IV, - SmallInstructionVector &LoopIncs) { - // This is a special case: here we're looking for all uses (except for - // the increment) to be multiplied by a common factor. The increment must - // be by one. This is to capture loops like: - // for (int i = 0; i < 500; ++i) { - // foo(3*i); foo(3*i+1); foo(3*i+2); - // } - if (RealIV->getNumUses() != 2) +/// Return true if IVU is a "simple" arithmetic operation. +/// This is used for narrowing the search space for DAGRoots; only arithmetic +/// and GEPs can be part of a DAGRoot. +static bool isSimpleArithmeticOp(User *IVU) { + if (Instruction *I = dyn_cast<Instruction>(IVU)) { + switch (I->getOpcode()) { + default: return false; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::GetElementPtr: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + return true; + } + } + return false; +} + +static bool isLoopIncrement(User *U, Instruction *IV) { + BinaryOperator *BO = dyn_cast<BinaryOperator>(U); + if (!BO || BO->getOpcode() != Instruction::Add) + return false; + + for (auto *UU : BO->users()) { + PHINode *PN = dyn_cast<PHINode>(UU); + if (PN && PN == IV) + return true; + } + return false; +} + +bool LoopReroll::DAGRootTracker:: +collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { + SmallInstructionVector BaseUsers; + + for (auto *I : Base->users()) { + ConstantInt *CI = nullptr; + + if (isLoopIncrement(I, IV)) { + LoopIncs.push_back(cast<Instruction>(I)); + continue; + } + + // The root nodes must be either GEPs, ORs or ADDs. 
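+ // [Editorial illustration, not part of this patch] For a base such as
+ //   %b = mul i32 %i, 3
+ // typical roots are %r1 = add i32 %b, 1 and %r2 = add i32 %b, 2. An OR can
+ // stand in for an ADD when the added constant shares no set bits with %b,
+ // and a GEP carries the same offset in its trailing constant index.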
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) { + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Or) + CI = dyn_cast<ConstantInt>(BO->getOperand(1)); + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1); + CI = dyn_cast<ConstantInt>(LastOperand); + } + + if (!CI) { + if (Instruction *II = dyn_cast<Instruction>(I)) { + BaseUsers.push_back(II); + continue; + } else { + DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n"); + return false; + } + } + + int64_t V = CI->getValue().getSExtValue(); + if (Roots.find(V) != Roots.end()) + // No duplicates, please. + return false; + + // FIXME: Add support for negative values. + if (V < 0) { + DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); + return false; + } + + Roots[V] = cast<Instruction>(I); + } + + if (Roots.empty()) return false; - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); - Instruction *User1 = cast<Instruction>(*RealIV->user_begin()), - *User2 = cast<Instruction>(*std::next(RealIV->user_begin())); - if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) + + // If we found non-loop-inc, non-root users of Base, assume they are + // for the zeroth root index. This is because "add %a, 0" gets optimized + // away. + if (BaseUsers.size()) { + if (Roots.find(0) != Roots.end()) { + DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n"); + return false; + } + Roots[0] = Base; + } + + // Calculate the number of users of the base, or lowest indexed, iteration. + unsigned NumBaseUses = BaseUsers.size(); + if (NumBaseUses == 0) + NumBaseUses = Roots.begin()->second->getNumUses(); + + // Check that every node has the same number of users. + for (auto &KV : Roots) { + if (KV.first == 0) + continue; + if (KV.second->getNumUses() != NumBaseUses) { + DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " + << "#Base=" << NumBaseUses << ", #Root=" << + KV.second->getNumUses() << "\n"); + return false; + } + } + + return true; +} + +bool LoopReroll::DAGRootTracker:: +findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { + // Does the user look like it could be part of a root set? + // All its users must be simple arithmetic ops. + if (I->getNumUses() > IL_MaxRerollIterations) return false; - const SCEVAddRecExpr *User1SCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), - *User2SCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); - if (!User1SCEV || !User1SCEV->isAffine() || - !User2SCEV || !User2SCEV->isAffine()) + + if ((I->getOpcode() == Instruction::Mul || + I->getOpcode() == Instruction::PHI) && + I != IV && + findRootsBase(I, SubsumedInsts)) + return true; + + SubsumedInsts.insert(I); + + for (User *V : I->users()) { + Instruction *I = dyn_cast<Instruction>(V); + if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end()) + continue; + + if (!I || !isSimpleArithmeticOp(I) || + !findRootsRecursive(I, SubsumedInsts)) + return false; + } + return true; +} + +bool LoopReroll::DAGRootTracker:: +findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { + + // The base instruction needs to be a multiply so + // that we can erase it. 
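+ // [Editorial illustration, not part of this patch] In the pass's running
+ // example
+ //   for (int i = 0; i < 500; ++i) { foo(3*i); foo(3*i+1); foo(3*i+2); }
+ // the base is the multiply feeding the roots; once the loop is rerolled
+ // onto a unit-stride induction variable, that multiply becomes dead.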
+ if (IVU->getOpcode() != Instruction::Mul && + IVU->getOpcode() != Instruction::PHI) + return false; + + std::map<int64_t, Instruction*> V; + if (!collectPossibleRoots(IVU, V)) return false; - // We assume below that User1 is the scale multiply and User2 is the - // increment. If this can't be true, then swap them. - if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { - std::swap(User1, User2); - std::swap(User1SCEV, User2SCEV); + // If we didn't get a root for index zero, then IVU must be + // subsumed. + if (V.find(0) == V.end()) + SubsumedInsts.insert(IVU); + + // Partition the vector into monotonically increasing indexes. + DAGRootSet DRS; + DRS.BaseInst = nullptr; + + for (auto &KV : V) { + if (!DRS.BaseInst) { + DRS.BaseInst = KV.second; + DRS.SubsumedInsts = SubsumedInsts; + } else if (DRS.Roots.empty()) { + DRS.Roots.push_back(KV.second); + } else if (V.find(KV.first - 1) != V.end()) { + DRS.Roots.push_back(KV.second); + } else { + // Linear sequence terminated. + RootSets.push_back(DRS); + DRS.BaseInst = KV.second; + DRS.SubsumedInsts = SubsumedInsts; + DRS.Roots.clear(); + } + } + RootSets.push_back(DRS); + + return true; +} + +bool LoopReroll::DAGRootTracker::findRoots() { + + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); + Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> + getValue()->getZExtValue(); + + assert(RootSets.empty() && "Unclean state!"); + if (Inc == 1) { + for (auto *IVU : IV->users()) { + if (isLoopIncrement(IVU, IV)) + LoopIncs.push_back(cast<Instruction>(IVU)); + } + if (!findRootsRecursive(IV, SmallInstructionSet())) + return false; + LoopIncs.push_back(IV); + } else { + if (!findRootsBase(IV, SmallInstructionSet())) + return false; } - if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) + // Ensure all sets have the same size. + if (RootSets.empty()) { + DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n"); return false; - assert(User2SCEV->getStepRecurrence(*SE)->isOne() && - "Invalid non-unit step for multiplicative scaling"); - LoopIncs.push_back(User2); - - if (const SCEVConstant *MulScale = - dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { - // Make sure that both the start and step have the same multiplier. - if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + } + for (auto &V : RootSets) { + if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) { + DEBUG(dbgs() + << "LRR: Aborting because not all root sets have the same size\n"); return false; - if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != - User1SCEV->getStart()) + } + } + + // And ensure all loop iterations are consecutive. We rely on std::map + // providing ordered traversal. + for (auto &V : RootSets) { + const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst)); + if (!ADR) return false; - ConstantInt *MulScaleCI = MulScale->getValue(); - if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) + // Consider a DAGRootSet with N-1 roots (so N different values including + // BaseInst). + // Define d = Roots[0] - BaseInst, which should be the same as + // Roots[I] - Roots[I-1] for all I in [1..N). + // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the + // loop iteration J. 
+ // + // Now, For the loop iterations to be consecutive: + // D = d * N + + unsigned N = V.Roots.size() + 1; + const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR); + const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N); + if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) { + DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n"); return false; - Scale = MulScaleCI->getZExtValue(); - IV = User1; - } else + } + } + Scale = RootSets[0].Roots.size() + 1; + + if (Scale > IL_MaxRerollIterations) { + DEBUG(dbgs() << "LRR: Aborting - too many iterations found. " + << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations + << "\n"); return false; + } + + DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n"); - DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); return true; } -// Collect all root increments with respect to the provided induction variable -// (normally the PHI, but sometimes a multiply). A root increment is an -// instruction, normally an add, with a positive constant less than Scale. In a -// rerollable loop, each of these increments is the root of an instruction -// graph isomorphic to the others. Also, we collect the final induction -// increment (the increment equal to the Scale), and its users in LoopIncs. -bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, - Instruction *IV, - SmallVector<SmallInstructionVector, 32> &Roots, - SmallInstructionSet &AllRoots, - SmallInstructionVector &LoopIncs) { - for (User *U : IV->users()) { - Instruction *UI = cast<Instruction>(U); - if (!SE->isSCEVable(UI->getType())) - continue; - if (UI->getType() != IV->getType()) - continue; - if (!L->contains(UI)) - continue; - if (hasUsesOutsideLoop(UI, L)) - continue; +bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) { + // Populate the MapVector with all instructions in the block, in order first, + // so we can iterate over the contents later in perfect order. + for (auto &I : *L->getHeader()) { + Uses[&I].resize(IL_End); + } + + SmallInstructionSet Exclude; + for (auto &DRS : RootSets) { + Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); + Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); + Exclude.insert(DRS.BaseInst); + } + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); + + for (auto &DRS : RootSets) { + DenseSet<Instruction*> VBase; + collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase); + for (auto *I : VBase) { + Uses[I].set(0); + } + + unsigned Idx = 1; + for (auto *Root : DRS.Roots) { + DenseSet<Instruction*> V; + collectInLoopUserSet(Root, Exclude, PossibleRedSet, V); - if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( - SE->getSCEV(UI), SE->getSCEV(IV)))) { - uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); - if (Idx > 0 && Idx < Scale) { - Roots[Idx-1].push_back(UI); - AllRoots.insert(UI); - } else if (Idx == Scale && Inc > 1) { - LoopIncs.push_back(UI); + // While we're here, check the use sets are the same size. + if (V.size() != VBase.size()) { + DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n"); + return false; + } + + for (auto *I : V) { + Uses[I].set(Idx); } + ++Idx; } + + // Make sure our subsumed instructions are remembered too. + for (auto *I : DRS.SubsumedInsts) { + Uses[I].set(IL_All); + } + } + + // Make sure the loop increments are also accounted for. 
+ + Exclude.clear(); + for (auto &DRS : RootSets) { + Exclude.insert(DRS.Roots.begin(), DRS.Roots.end()); + Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end()); + Exclude.insert(DRS.BaseInst); + } + + DenseSet<Instruction*> V; + collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V); + for (auto *I : V) { + Uses[I].set(IL_All); + } + + return true; + +} + +/// Get the next instruction in "In" that is a member of set Val. +/// Start searching from StartI, and do not return anything in Exclude. +/// If StartI is not given, start from In.begin(). +LoopReroll::DAGRootTracker::UsesTy::iterator +LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In, + const SmallInstructionSet &Exclude, + UsesTy::iterator *StartI) { + UsesTy::iterator I = StartI ? *StartI : In.begin(); + while (I != In.end() && (I->second.test(Val) == 0 || + Exclude.count(I->first) != 0)) + ++I; + return I; +} + +bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) { + for (auto &DRS : RootSets) { + if (DRS.BaseInst == I) + return true; } + return false; +} - if (Roots[0].empty()) +bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) { + for (auto &DRS : RootSets) { + if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end()) + return true; + } + return false; +} + +/// Return true if instruction I depends on any instruction between +/// Start and End. +bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, + UsesTy::iterator Start, + UsesTy::iterator End) { + for (auto *U : I->users()) { + for (auto It = Start; It != End; ++It) + if (U == It->first) + return true; + } + return false; +} + +bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { + // We now need to check for equivalence of the use graph of each root with + // that of the primary induction variable (excluding the roots). Our goal + // here is not to solve the full graph isomorphism problem, but rather to + // catch common cases without a lot of work. As a result, we will assume + // that the relative order of the instructions in each unrolled iteration + // is the same (although we will not make an assumption about how the + // different iterations are intermixed). Note that while the order must be + // the same, the instructions may not be in the same basic block. + + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). + // This is important because we don't want the root use set to search down + // the reduction chain. + SmallInstructionSet PossibleRedSet; + SmallInstructionSet PossibleRedLastSet; + SmallInstructionSet PossibleRedPHISet; + Reductions.restrictToScale(Scale, PossibleRedSet, + PossibleRedPHISet, PossibleRedLastSet); + + // Populate "Uses" with where each instruction is used. + if (!collectUsedInstructions(PossibleRedSet)) return false; - bool AllSame = true; - for (unsigned i = 1; i < Scale-1; ++i) - if (Roots[i].size() != Roots[0].size()) { - AllSame = false; - break; + + // Make sure we mark the reduction PHIs as used in all iterations. + for (auto *I : PossibleRedPHISet) { + Uses[I].set(IL_All); + } + + // Make sure all instructions in the loop are in one and only one + // set. 
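+ // [Editorial illustration, not part of this patch] Each instruction's
+ // SmallBitVector carries one bit per unrolled iteration (plus IL_All for
+ // the loop-increment machinery); an instruction with two bits set would be
+ // shared between iterations, which the reroller cannot split apart, so
+ // such loops are rejected below.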
+ for (auto &KV : Uses) { + if (KV.second.count() != 1) { + DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " + << *KV.first << " (#uses=" << KV.second.count() << ")\n"); + return false; } + } - if (!AllSame) - return false; + DEBUG( + for (auto &KV : Uses) { + dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n"; + } + ); + + for (unsigned Iter = 1; Iter < Scale; ++Iter) { + // In addition to regular aliasing information, we need to look for + // instructions from later (future) iterations that have side effects + // preventing us from reordering them past other instructions with side + // effects. + bool FutureSideEffects = false; + AliasSetTracker AST(*AA); + // The map between instructions in f(%iv.(i+1)) and f(%iv). + DenseMap<Value *, Value *> BaseMap; + + // Compare iteration Iter to the base. + SmallInstructionSet Visited; + auto BaseIt = nextInstr(0, Uses, Visited); + auto RootIt = nextInstr(Iter, Uses, Visited); + auto LastRootIt = Uses.begin(); + + while (BaseIt != Uses.end() && RootIt != Uses.end()) { + Instruction *BaseInst = BaseIt->first; + Instruction *RootInst = RootIt->first; + + // Skip over the IV or root instructions; only match their users. + bool Continue = false; + if (isBaseInst(BaseInst)) { + Visited.insert(BaseInst); + BaseIt = nextInstr(0, Uses, Visited); + Continue = true; + } + if (isRootInst(RootInst)) { + LastRootIt = RootIt; + Visited.insert(RootInst); + RootIt = nextInstr(Iter, Uses, Visited); + Continue = true; + } + if (Continue) continue; + + if (!BaseInst->isSameOperationAs(RootInst)) { + // Last chance saloon. We don't try and solve the full isomorphism + // problem, but try and at least catch the case where two instructions + // *of different types* are round the wrong way. We won't be able to + // efficiently tell, given two ADD instructions, which way around we + // should match them, but given an ADD and a SUB, we can at least infer + // which one is which. + // + // This should allow us to deal with a greater subset of the isomorphism + // problem. It does however change a linear algorithm into a quadratic + // one, so limit the number of probes we do. + auto TryIt = RootIt; + unsigned N = NumToleratedFailedMatches; + while (TryIt != Uses.end() && + !BaseInst->isSameOperationAs(TryIt->first) && + N--) { + ++TryIt; + TryIt = nextInstr(Iter, Uses, Visited, &TryIt); + } + + if (TryIt == Uses.end() || TryIt == RootIt || + instrDependsOn(TryIt->first, RootIt, TryIt)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << "\n"); + return false; + } + + RootIt = TryIt; + RootInst = TryIt->first; + } + + // All instructions between the last root and this root + // may belong to some other iteration. If they belong to a + // future iteration, then they're dangerous to alias with. + // + // Note that because we allow a limited amount of flexibility in the order + // that we visit nodes, LastRootIt might be *before* RootIt, in which + // case we've already checked this set of instructions so we shouldn't + // do anything. + for (; LastRootIt < RootIt; ++LastRootIt) { + Instruction *I = LastRootIt->first; + if (LastRootIt->second.find_first() < (int)Iter) + continue; + if (I->mayWriteToMemory()) + AST.add(I); + // Note: This is specifically guarded by a check on isa<PHINode>, + // which while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. 
+ if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (depends on future store)\n");
+ return false;
+ }
+ }
+
+ // If we've passed an instruction from a future iteration that may have
+ // side effects, and this instruction might also, then we can't reorder
+ // them, and this matching fails. As an exception, we allow the alias
+ // set tracker to handle regular (simple) load/store dependencies.
+ if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isSimpleLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst <<
+ " (side effects prevent reordering)\n");
+ return false;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+ if (!(InReduction && BaseInst->isAssociative())) {
+ bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+ // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(RootInst, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end()) {
+ Op2 = BMI->second;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
+
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
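+ // [Editorial illustration, not part of this patch] For a commutative
+ // operation, the base iteration may compute "add %a, %b" while a later
+ // iteration computes "add %b', %a'"; once the first operand matches
+ // cross-wise, the remaining operands are compared in swapped order.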
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched && + BaseInst->getOperand(!j) == Op2) { + Swapped = true; + } else { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst + << " vs. " << *RootInst << " (operand " << j << ")\n"); + return false; + } + } + + SomeOpMatched = true; + } + } + + if ((!PossibleRedLastSet.count(BaseInst) && + hasUsesOutsideLoop(BaseInst, L)) || + (!PossibleRedLastSet.count(RootInst) && + hasUsesOutsideLoop(RootInst, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst << + " vs. " << *RootInst << " (uses outside loop)\n"); + return false; + } + + Reductions.recordPair(BaseInst, RootInst, Iter); + BaseMap.insert(std::make_pair(RootInst, BaseInst)); + + LastRootIt = RootIt; + Visited.insert(BaseInst); + Visited.insert(RootInst); + BaseIt = nextInstr(0, Uses, Visited); + RootIt = nextInstr(Iter, Uses, Visited); + } + assert (BaseIt == Uses.end() && RootIt == Uses.end() && + "Mismatched set sizes!"); + } + + DEBUG(dbgs() << "LRR: Matched all iteration increments for " << + *IV << "\n"); return true; } +void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { + BasicBlock *Header = L->getHeader(); + // Remove instructions associated with non-base iterations. + for (BasicBlock::reverse_iterator J = Header->rbegin(); + J != Header->rend();) { + unsigned I = Uses[&*J].find_first(); + if (I > 0 && I < IL_All) { + Instruction *D = &*J; + DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); + D->eraseFromParent(); + continue; + } + + ++J; + } + const DataLayout &DL = Header->getModule()->getDataLayout(); + + // We need to create a new induction variable for each different BaseInst. + for (auto &DRS : RootSets) { + // Insert the new induction variable. + const SCEVAddRecExpr *RealIVSCEV = + cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); + const SCEV *Start = RealIVSCEV->getStart(); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> + (SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. + SCEVExpander Expander(*SE, DL, "reroll"); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + + for (auto &KV : Uses) { + if (KV.second.find_first() == 0) + KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV); + } + + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + // FIXME: Why do we need this check? + if (Uses[BI].find_first() == IL_All) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + + // Iteration count SCEV minus 1 + const SCEV *ICMinus1SCEV = + SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + + Value *ICMinus1; // Iteration count minus 1 + if (isa<SCEVConstant>(ICMinus1SCEV)) { + ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, Parent); + + ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *Cond = + new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } + } + } + } + + SimplifyInstructionsInBlock(Header, TLI); + DeleteDeadPHIs(Header, TLI); +} + // Validate the selected reductions. All iterations must have an isomorphic // part of the reduction chain and, for non-associative reductions, the chain // entries must appear in order. 
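[Editor's note] The hunks above introduce the DAGRootTracker stages (findRoots, validate, replace); the hunks below rewire LoopReroll::reroll onto them. As a reader's aid, and echoing the example in the pass's own header comment, the end-to-end effect of rerolling is the following (a sketch, not part of the commit):

    // Before: a manually unrolled loop with scale 3.
    for (int i = 0; i < 500; ++i) {
      foo(3 * i);
      foo(3 * i + 1);
      foo(3 * i + 2);
    }

    // After: the compact loop the pass reconstructs.
    for (int i = 0; i < 1500; ++i)
      foo(i);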
@@ -711,8 +1387,9 @@ void LoopReroll::ReductionTracker::replaceSelected() { // Replace users with the new end-of-chain value. SmallInstructionVector Users; - for (User *U : PossibleReds[i].getReducedValue()->users()) + for (User *U : PossibleReds[i].getReducedValue()->users()) { Users.push_back(cast<Instruction>(U)); + } for (SmallInstructionVector::iterator J = Users.begin(), JE = Users.end(); J != JE; ++J) @@ -767,359 +1444,23 @@ void LoopReroll::ReductionTracker::replaceSelected() { bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); - // The collection of loop increment instructions. - SmallInstructionVector LoopIncs; - uint64_t Scale = Inc; - - // The effective induction variable, IV, is normally also the real induction - // variable. When we're dealing with a loop like: - // for (int i = 0; i < 500; ++i) - // x[3*i] = ...; - // x[3*i+1] = ...; - // x[3*i+2] = ...; - // then the real IV is still i, but the effective IV is (3*i). - Instruction *RealIV = IV; - if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) - return false; - - assert(Scale <= MaxInc && "Scale is too large"); - assert(Scale > 1 && "Scale must be at least 2"); + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); - // The set of increment instructions for each increment value. - SmallVector<SmallInstructionVector, 32> Roots(Scale-1); - SmallInstructionSet AllRoots; - if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) + if (!DAGRoots.findRoots()) return false; - DEBUG(dbgs() << "LRR: Found all root induction increments for: " << - *RealIV << "\n"); - - // An array of just the possible reductions for this scale factor. When we - // collect the set of all users of some root instructions, these reduction - // instructions are treated as 'final' (their uses are not considered). - // This is important because we don't want the root use set to search down - // the reduction chain. - SmallInstructionSet PossibleRedSet; - SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; - Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, - PossibleRedLastSet); - - // We now need to check for equivalence of the use graph of each root with - // that of the primary induction variable (excluding the roots). Our goal - // here is not to solve the full graph isomorphism problem, but rather to - // catch common cases without a lot of work. As a result, we will assume - // that the relative order of the instructions in each unrolled iteration - // is the same (although we will not make an assumption about how the - // different iterations are intermixed). Note that while the order must be - // the same, the instructions may not be in the same basic block. 
- SmallInstructionSet Exclude(AllRoots); - Exclude.insert(LoopIncs.begin(), LoopIncs.end()); - - DenseSet<Instruction *> BaseUseSet; - collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); - - DenseSet<Instruction *> AllRootUses; - std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); - - bool MatchFailed = false; - for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { - DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; - collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), - PossibleRedSet, RootUseSet); - - DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << - " vs. iteration increment " << (i+1) << - " use set size: " << RootUseSet.size() << "\n"); - - if (BaseUseSet.size() != RootUseSet.size()) { - MatchFailed = true; - break; - } - - // In addition to regular aliasing information, we need to look for - // instructions from later (future) iterations that have side effects - // preventing us from reordering them past other instructions with side - // effects. - bool FutureSideEffects = false; - AliasSetTracker AST(*AA); - - // The map between instructions in f(%iv.(i+1)) and f(%iv). - DenseMap<Value *, Value *> BaseMap; - - assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); - for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), - JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { - if (cast<Instruction>(J1) == RealIV) - continue; - if (cast<Instruction>(J1) == IV) - continue; - if (!BaseUseSet.count(J1)) - continue; - if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. - continue; - - while (J2 != JE && (!RootUseSet.count(J2) || - std::find(Roots[i].begin(), Roots[i].end(), J2) != - Roots[i].end())) { - // As we iterate through the instructions, instructions that don't - // belong to previous iterations (or the base case), must belong to - // future iterations. We want to track the alias set of writes from - // previous iterations. - if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && - !AllRootUses.count(J2)) { - if (J2->mayWriteToMemory()) - AST.add(J2); - - // Note: This is specifically guarded by a check on isa<PHINode>, - // which while a valid (somewhat arbitrary) micro-optimization, is - // needed because otherwise isSafeToSpeculativelyExecute returns - // false on PHI nodes. - if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) - FutureSideEffects = true; - } - - ++J2; - } - - if (!J1->isSameOperationAs(J2)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << "\n"); - MatchFailed = true; - break; - } - - // Make sure that this instruction, which is in the use set of this - // root instruction, does not also belong to the base set or the set of - // some previous root instruction. - if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (prev. case overlap)\n"); - MatchFailed = true; - break; - } - - // Make sure that we don't alias with any instruction in the alias set - // tracker. If we do, then we depend on a future iteration, and we - // can't reroll. - if (J2->mayReadFromMemory()) { - for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); - K != KE && !MatchFailed; ++K) { - if (K->aliasesUnknownInst(J2, *AA)) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. 
" << *J2 << " (depends on future store)\n"); - MatchFailed = true; - break; - } - } - } - - // If we've past an instruction from a future iteration that may have - // side effects, and this instruction might also, then we can't reorder - // them, and this matching fails. As an exception, we allow the alias - // set tracker to handle regular (simple) load/store dependencies. - if (FutureSideEffects && - ((!isSimpleLoadStore(J1) && - !isSafeToSpeculativelyExecute(J1, DL)) || - (!isSimpleLoadStore(J2) && - !isSafeToSpeculativelyExecute(J2, DL)))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << - " (side effects prevent reordering)\n"); - MatchFailed = true; - break; - } - - // For instructions that are part of a reduction, if the operation is - // associative, then don't bother matching the operands (because we - // already know that the instructions are isomorphic, and the order - // within the iteration does not matter). For non-associative reductions, - // we do need to match the operands, because we need to reject - // out-of-order instructions within an iteration! - // For example (assume floating-point addition), we need to reject this: - // x += a[i]; x += b[i]; - // x += a[i+1]; x += b[i+1]; - // x += b[i+2]; x += a[i+2]; - bool InReduction = Reductions.isPairInSame(J1, J2); - - if (!(InReduction && J1->isAssociative())) { - bool Swapped = false, SomeOpMatched = false; - for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { - Value *Op2 = J2->getOperand(j); - - // If this is part of a reduction (and the operation is not - // associatve), then we match all operands, but not those that are - // part of the reduction. - if (InReduction) - if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) - if (Reductions.isPairInSame(J2, Op2I)) - continue; - - DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); - if (BMI != BaseMap.end()) - Op2 = BMI->second; - else if (std::find(Roots[i].begin(), Roots[i].end(), - (Instruction*) Op2) != Roots[i].end()) - Op2 = IV; - - if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { - // If we've not already decided to swap the matched operands, and - // we've not already matched our first operand (note that we could - // have skipped matching the first operand because it is part of a - // reduction above), and the instruction is commutative, then try - // the swapped match. - if (!Swapped && J1->isCommutative() && !SomeOpMatched && - J1->getOperand(!j) == Op2) { - Swapped = true; - } else { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. " << *J2 << " (operand " << j << ")\n"); - MatchFailed = true; - break; - } - } - - SomeOpMatched = true; - } - } - - if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || - (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { - DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << - " vs. 
" << *J2 << " (uses outside loop)\n"); - MatchFailed = true; - break; - } - - if (!MatchFailed) - BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); - - AllRootUses.insert(J2); - Reductions.recordPair(J1, J2, i+1); - - ++J2; - } - } - - if (MatchFailed) - return false; - - DEBUG(dbgs() << "LRR: Matched all iteration increments for " << - *RealIV << "\n"); - - DenseSet<Instruction *> LoopIncUseSet; - collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), - SmallInstructionSet(), LoopIncUseSet); - DEBUG(dbgs() << "LRR: Loop increment set size: " << - LoopIncUseSet.size() << "\n"); - - // Make sure that all instructions in the loop have been included in some - // use set. - for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); - J != JE; ++J) { - if (isa<DbgInfoIntrinsic>(J)) - continue; - if (cast<Instruction>(J) == RealIV) - continue; - if (cast<Instruction>(J) == IV) - continue; - if (BaseUseSet.count(J) || AllRootUses.count(J) || - (LoopIncUseSet.count(J) && (J->isTerminator() || - isSafeToSpeculativelyExecute(J, DL)))) - continue; - - if (AllRoots.count(J)) - continue; - - if (Reductions.isSelectedPHI(J)) - continue; - - DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << - " unprocessed instruction found: " << *J << "\n"); - MatchFailed = true; - break; - } - - if (MatchFailed) + *IV << "\n"); + + if (!DAGRoots.validate(Reductions)) return false; - - DEBUG(dbgs() << "LRR: all instructions processed from " << - *RealIV << "\n"); - if (!Reductions.validateSelected()) return false; - // At this point, we've validated the rerolling, and we're committed to // making changes! Reductions.replaceSelected(); + DAGRoots.replace(IterCount); - // Remove instructions associated with non-base iterations. - for (BasicBlock::reverse_iterator J = Header->rbegin(); - J != Header->rend();) { - if (AllRootUses.count(&*J)) { - Instruction *D = &*J; - DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); - D->eraseFromParent(); - continue; - } - - ++J; - } - - // Insert the new induction variable. - const SCEV *Start = RealIVSCEV->getStart(); - if (Inc == 1) - Start = SE->getMulExpr(Start, - SE->getConstant(Start->getType(), Scale)); - const SCEVAddRecExpr *H = - cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); - { // Limit the lifetime of SCEVExpander. 
- SCEVExpander Expander(*SE, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); - - for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), - JE = BaseUseSet.end(); J != JE; ++J) - (*J)->replaceUsesOfWith(IV, NewIV); - - if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { - if (LoopIncUseSet.count(BI)) { - const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); - if (Inc == 1) - ICSCEV = - SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); - // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); - - Value *ICMinus1; // Iteration count minus 1 - if (isa<SCEVConstant>(ICMinus1SCEV)) { - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); - } else { - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) - Preheader = InsertPreheaderForLoop(L, this); - - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), - Preheader->getTerminator()); - } - - Value *Cond = - new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); - BI->setCondition(Cond); - - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); - } - } - } - - SimplifyInstructionsInBlock(Header, DL, TLI); - DeleteDeadPHIs(Header, TLI); ++NumRerolledLoops; return true; } @@ -1129,11 +1470,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; AA = &getAnalysis<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolution>(); - TLI = &getAnalysis<TargetLibraryInfo>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BasicBlock *Header = L->getHeader(); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 267cb99..a675e12 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -24,8 +24,10 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -56,14 +58,14 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -75,14 +77,15 @@ namespace { LoopInfo *LI; const TargetTransformInfo *TTI; AssumptionCache *AC; + DominatorTree *DT; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) @@ -100,10 +103,13 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Save the loop metadata. MDNode *LoopMD = L->getLoopID(); - LI = &getAnalysis<LoopInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( - *L->getHeader()->getParent()); + Function &F = *L->getHeader()->getParent(); + + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -226,20 +232,17 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: { - Value *IVOpnd = nullptr; - if (isa<ConstantInt>(I->getOperand(0))) - IVOpnd = I->getOperand(1); - - if (isa<ConstantInt>(I->getOperand(1))) { - if (IVOpnd) - return false; - - IVOpnd = I->getOperand(0); - } + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; // If increment operand is used outside of the loop, this speculation // could cause extra live range interference. - if (MultiExitLoop && IVOpnd) { + if (MultiExitLoop) { for (User *UseI : IVOpnd->users()) { auto *UserInst = cast<Instruction>(UseI); if (!L->contains(UserInst)) @@ -308,9 +311,8 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { // Nuke the Latch block. assert(Latch->empty() && "unable to evacuate Latch"); LI->removeBlock(Latch); - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) - DTWP->getDomTree().eraseNode(Latch); + if (DT) + DT->eraseNode(Latch); Latch->eraseFromParent(); return true; } @@ -412,6 +414,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); @@ -442,8 +446,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. - // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction. - Value *V = SimplifyInstruction(C); + // FIXME: Provide TLI, DT, AC to SimplifyInstruction. + Value *V = SimplifyInstruction(C, DL); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. 
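// Editorial sketch, not from the patch: with DataLayoutPass removed, passes
// reach the layout through the IR itself, exactly as the rotateLoop change
// above does. A minimal illustration against the 3.7-era headers:
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Module.h"

static const llvm::DataLayout &layoutFor(llvm::BasicBlock &BB) {
  // Every BasicBlock belongs to a Function inside a Module, and the Module
  // now owns the DataLayout, so no pass-manager plumbing is needed.
  return BB.getModule()->getDataLayout();
}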
@@ -495,31 +499,31 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // The conditional branch can't be folded, handle the general case. // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) { // Everything that was dominated by the old loop header is now dominated // by the original loop preheader. Conceptually the header was merged // into the preheader, even though we reuse the actual block as a new // loop latch. - DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader); + DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); + DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode); + assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); + assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. - DT.changeImmediateDominator(OrigHeader, OrigLatch); + DT->changeImmediateDominator(OrigHeader, OrigLatch); } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and // thus is not a preheader anymore. // Split the edge to form a real preheader. - BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); NewPH->setName(NewHeader->getName() + ".lr.ph"); // Preserve canonical loop form, which means that 'Exit' should have only @@ -538,7 +542,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { if (isa<IndirectBrInst>((*PI)->getTerminator())) continue; SplitLatchEdge |= L->getLoopLatch() == *PI; - BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this); + BasicBlock *ExitSplit = SplitCriticalEdge( + *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); ExitSplit->moveBefore(Exit); } assert(SplitLatchEdge && @@ -552,17 +557,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) { // Update OrigHeader to be dominated by the new header block. - DT.changeImmediateDominator(NewHeader, OrigPreheader); - DT.changeImmediateDominator(OrigHeader, OrigLatch); + DT->changeImmediateDominator(NewHeader, OrigPreheader); + DT->changeImmediateDominator(OrigHeader, OrigLatch); // Brute force incremental dominator tree update. Call // findNearestCommonDominator on all CFG predecessors of each child of the // original header. 
- DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader); + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), OrigHeaderNode->end()); bool Changed; @@ -575,11 +578,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { pred_iterator PI = pred_begin(BB); BasicBlock *NearestDom = *PI; for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT.findNearestCommonDominator(NearestDom, *PI); + NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); // Remember if this changes the DomTree. if (Node->getIDom()->getBlock() != NearestDom) { - DT.changeImmediateDominator(BB, NearestDom); + DT->changeImmediateDominator(BB, NearestDom); Changed = true; } } @@ -597,7 +600,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // the OrigHeader block into OrigLatch. This will succeed if they are // connected by an unconditional branch. This is just a cleanup so the // emitted code isn't too gross in this common case. - MergeBlockIntoPredecessor(OrigHeader, this); + MergeBlockIntoPredecessor(OrigHeader, DT, LI); DEBUG(dbgs() << "LoopRotation: into "; L->dump()); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7b60373..584c7ae 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -28,7 +28,7 @@ // // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however // it's useful to think about these as the same register, with some uses using -// the value of the register before the add and some using // it after. In this +// the value of the register before the add and some using it after. In this // example, the icmp is a post-increment user, since it uses %i.next, which is // the value of the induction variable after the increment. The other common // case of post-increment users is users outside the loop. @@ -68,6 +68,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -111,8 +112,6 @@ public: /// a particular register. SmallBitVector UsedByIndices; - RegSortData() {} - void print(raw_ostream &OS) const; void dump() const; }; @@ -186,9 +185,8 @@ RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { // Update RegUses. The data structure is not optimized for this purpose; // we must iterate through it and update each of the bit vectors. - for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end(); - I != E; ++I) { - SmallBitVector &UsedByIndices = I->second.UsedByIndices; + for (auto &Pair : RegUsesMap) { + SmallBitVector &UsedByIndices = Pair.second.UsedByIndices; if (LUIdx < UsedByIndices.size()) UsedByIndices[LUIdx] = LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0; @@ -298,9 +296,8 @@ static void DoInitialMatch(const SCEV *S, Loop *L, // Look at add operands. 
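// Editorial sketch, not from the patch: the mechanical cleanup applied
// throughout the LSR hunks below replaces explicit iterator loops with
// range-based for loops, with no change in behavior. A standalone
// illustration of the same rewrite:
#include <vector>

static int sumOldStyle(const std::vector<int> &V) {
  int S = 0;
  for (std::vector<int>::const_iterator I = V.begin(), E = V.end(); I != E;
       ++I)
    S += *I; // explicit iterator form, as in the removed lines
  return S;
}

static int sumNewStyle(const std::vector<int> &V) {
  int S = 0;
  for (int X : V) // what the diff rewrites the loop above into
    S += X;
  return S;
}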
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { - for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) - DoInitialMatch(*I, L, Good, Bad, SE); + for (const SCEV *S : Add->operands()) + DoInitialMatch(S, L, Good, Bad, SE); return; } @@ -327,12 +324,10 @@ static void DoInitialMatch(const SCEV *S, Loop *L, DoInitialMatch(NewMul, L, MyGood, MyBad, SE); const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( SE.getEffectiveSCEVType(NewMul->getType()))); - for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(), - E = MyGood.end(); I != E; ++I) - Good.push_back(SE.getMulExpr(NegOne, *I)); - for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(), - E = MyBad.end(); I != E; ++I) - Bad.push_back(SE.getMulExpr(NegOne, *I)); + for (const SCEV *S : MyGood) + Good.push_back(SE.getMulExpr(NegOne, S)); + for (const SCEV *S : MyBad) + Bad.push_back(SE.getMulExpr(NegOne, S)); return; } @@ -444,9 +439,8 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, if (ScaledReg) if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx)) return true; - for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), - E = BaseRegs.end(); I != E; ++I) - if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx)) + for (const SCEV *BaseReg : BaseRegs) + if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx)) return true; return false; } @@ -461,10 +455,9 @@ void Formula::print(raw_ostream &OS) const { if (!First) OS << " + "; else First = false; OS << BaseOffset; } - for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), - E = BaseRegs.end(); I != E; ++I) { + for (const SCEV *BaseReg : BaseRegs) { if (!First) OS << " + "; else First = false; - OS << "reg(" << **I << ')'; + OS << "reg(" << *BaseReg << ')'; } if (HasBaseReg && BaseRegs.empty()) { if (!First) OS << " + "; else First = false; @@ -577,10 +570,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) { if (IgnoreSignificantBits || isAddSExtable(Add, SE)) { SmallVector<const SCEV *, 8> Ops; - for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) { - const SCEV *Op = getExactSDiv(*I, RHS, SE, - IgnoreSignificantBits); + for (const SCEV *S : Add->operands()) { + const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits); if (!Op) return nullptr; Ops.push_back(Op); } @@ -594,9 +585,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) { SmallVector<const SCEV *, 4> Ops; bool Found = false; - for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end(); - I != E; ++I) { - const SCEV *S = *I; + for (const SCEV *S : Mul->operands()) { if (!Found) if (const SCEV *Q = getExactSDiv(S, RHS, SE, IgnoreSignificantBits)) { @@ -766,9 +755,8 @@ static bool isHighCostExpansion(const SCEV *S, return false; if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { - for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) { - if (isHighCostExpansion(*I, Processed, SE)) + for (const SCEV *S : Add->operands()) { + if (isHighCostExpansion(S, Processed, SE)) return true; } return false; @@ -819,9 +807,9 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { if (!I || !isInstructionTriviallyDead(I)) continue; - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *U = dyn_cast<Instruction>(*OI)) { - *OI = nullptr; + for (Use &O 
: I->operands()) + if (Instruction *U = dyn_cast<Instruction>(O)) { + O = nullptr; if (U->use_empty()) DeadInsts.push_back(U); } @@ -1002,9 +990,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, if (isLoser()) return; } - for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), - E = F.BaseRegs.end(); I != E; ++I) { - const SCEV *BaseReg = *I; + for (const SCEV *BaseReg : F.BaseRegs) { if (VisitedRegs.count(BaseReg)) { Lose(); return; @@ -1027,9 +1013,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. - for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), - E = Offsets.end(); I != E; ++I) { - int64_t Offset = (uint64_t)*I + F.BaseOffset; + for (int64_t O : Offsets) { + int64_t Offset = (uint64_t)O + F.BaseOffset; if (F.BaseGV) ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. @@ -1152,10 +1137,9 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", OperandValToReplace="; OperandValToReplace->printAsOperand(OS, /*PrintType=*/false); - for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), - E = PostIncLoops.end(); I != E; ++I) { + for (const Loop *PIL : PostIncLoops) { OS << ", PostIncLoop="; - (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false); + PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false); } if (LUIdx != ~size_t(0)) @@ -1301,9 +1285,8 @@ bool LSRUse::InsertFormula(const Formula &F) { assert((!F.ScaledReg || !F.ScaledReg->isZero()) && "Zero allocated in a scaled register!"); #ifndef NDEBUG - for (SmallVectorImpl<const SCEV *>::const_iterator I = - F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) - assert(!(*I)->isZero() && "Zero allocated in a base register!"); + for (const SCEV *BaseReg : F.BaseRegs) + assert(!BaseReg->isZero() && "Zero allocated in a base register!"); #endif // Add the formula to the list. @@ -1327,11 +1310,9 @@ void LSRUse::DeleteFormula(Formula &F) { /// RecomputeRegs - Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. - SmallPtrSet<const SCEV *, 4> OldRegs = Regs; + SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); Regs.clear(); - for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(), - E = Formulae.end(); I != E; ++I) { - const Formula &F = *I; + for (const Formula &F : Formulae) { if (F.ScaledReg) Regs.insert(F.ScaledReg); Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); } @@ -1357,11 +1338,11 @@ void LSRUse::print(raw_ostream &OS) const { } OS << ", Offsets={"; - for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), - E = Offsets.end(); I != E; ++I) { - OS << *I; - if (std::next(I) != E) - OS << ','; + bool NeedComma = false; + for (int64_t O : Offsets) { + if (NeedComma) OS << ','; + OS << O; + NeedComma = true; } OS << '}'; @@ -1386,9 +1367,6 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); - // Otherwise, just guess that reg+reg addressing is legal. - //return ; - case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. @@ -1928,12 +1906,12 @@ void LSRInstance::OptimizeShadowIV() { /// set the IV user and stride information and return true, otherwise return /// false. 
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { - for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) - if (UI->getUser() == Cond) { + for (IVStrideUse &U : IU) + if (U.getUser() == Cond) { // NOTE: we could handle setcc instructions with multiple uses here, but // InstCombine does it as well for simple uses, it's not clear that it // occurs enough in real life to handle. - CondUse = UI; + CondUse = &U; return true; } return false; @@ -2108,8 +2086,7 @@ LSRInstance::OptimizeLoopTermCond() { SmallVector<BasicBlock*, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); - for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { - BasicBlock *ExitingBlock = ExitingBlocks[i]; + for (BasicBlock *ExitingBlock : ExitingBlocks) { // Get the terminating condition for the loop if possible. If we // can, we want to change it to use a post-incremented version of its @@ -2352,9 +2329,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, LU.WidestFixupType == OrigLU.WidestFixupType && LU.HasFormulaWithSameRegs(OrigF)) { // Scan through this use's formulae. - for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), - E = LU.Formulae.end(); I != E; ++I) { - const Formula &F = *I; + for (const Formula &F : LU.Formulae) { // Check to see if this formula has the same registers and symbols // as OrigF. if (F.BaseRegs == OrigF.BaseRegs && @@ -2382,8 +2357,8 @@ void LSRInstance::CollectInterestingTypesAndFactors() { // Collect interesting types and strides. SmallVector<const SCEV *, 4> Worklist; - for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { - const SCEV *Expr = IU.getExpr(*UI); + for (const IVStrideUse &U : IU) { + const SCEV *Expr = IU.getExpr(U); // Collect interesting types. Types.insert(SE.getEffectiveSCEVType(Expr->getType())); @@ -2586,25 +2561,23 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, unsigned NumConstIncrements = 0; unsigned NumVarIncrements = 0; unsigned NumReusedIncrements = 0; - for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); - I != E; ++I) { - - if (I->IncExpr->isZero()) + for (const IVInc &Inc : Chain) { + if (Inc.IncExpr->isZero()) continue; // Incrementing by zero or some constant is neutral. We assume constants can // be folded into an addressing mode or an add's immediate operand. - if (isa<SCEVConstant>(I->IncExpr)) { + if (isa<SCEVConstant>(Inc.IncExpr)) { ++NumConstIncrements; continue; } - if (I->IncExpr == LastIncExpr) + if (Inc.IncExpr == LastIncExpr) ++NumReusedIncrements; else ++NumVarIncrements; - LastIncExpr = I->IncExpr; + LastIncExpr = Inc.IncExpr; } // An IV chain with a single increment is handled by LSR's postinc // uses. 
However, a chain with multiple increments requires keeping the IV's
@@ -2839,12 +2812,11 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
   DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
 
-  for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
-       I != E; ++I) {
-    DEBUG(dbgs() << "        Inc: " << *I->UserInst << "\n");
-    User::op_iterator UseI =
-        std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
-    assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
+  for (const IVInc &Inc : Chain) {
+    DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
+    auto UseI = std::find(Inc.UserInst->op_begin(), Inc.UserInst->op_end(),
+                          Inc.IVOperand);
+    assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
     IVIncSet.insert(UseI);
   }
 }
@@ -2907,20 +2879,18 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
   Type *IVTy = IVSrc->getType();
   Type *IntTy = SE.getEffectiveSCEVType(IVTy);
   const SCEV *LeftOverExpr = nullptr;
-  for (IVChain::const_iterator IncI = Chain.begin(),
-       IncE = Chain.end(); IncI != IncE; ++IncI) {
-
-    Instruction *InsertPt = IncI->UserInst;
+  for (const IVInc &Inc : Chain) {
+    Instruction *InsertPt = Inc.UserInst;
     if (isa<PHINode>(InsertPt))
       InsertPt = L->getLoopLatch()->getTerminator();
 
     // IVOper will replace the current IV User's operand. IVSrc is the IV
     // value currently held in a register.
     Value *IVOper = IVSrc;
-    if (!IncI->IncExpr->isZero()) {
+    if (!Inc.IncExpr->isZero()) {
       // IncExpr was the result of subtraction of two narrow values, so must
       // be signed.
-      const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
+      const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
       LeftOverExpr = LeftOverExpr ?
         SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
     }
@@ -2933,22 +2903,21 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
       IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
 
       // If an IV increment can't be folded, use it as the next IV value.
-      if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
-                            TTI)) {
+      if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
         IVSrc = IVOper;
         LeftOverExpr = nullptr;
       }
     }
-    Type *OperTy = IncI->IVOperand->getType();
+    Type *OperTy = Inc.IVOperand->getType();
     if (IVTy != OperTy) {
       assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
              "cannot extend a chained IV");
       IRBuilder<> Builder(InsertPt);
       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
     }
-    IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper);
-    DeadInsts.push_back(IncI->IVOperand);
+    Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+    DeadInsts.push_back(Inc.IVOperand);
   }
   // If LSR created a new, wider phi, we may also replace its postinc. We only
   // do this if we also found a wide value for the head of the chain.
@@ -2976,11 +2945,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
 }
 
 void LSRInstance::CollectFixupsAndInitialFormulae() {
-  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
-    Instruction *UserInst = UI->getUser();
+  for (const IVStrideUse &U : IU) {
+    Instruction *UserInst = U.getUser();
     // Skip IV users that are part of profitable IV Chains.
User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(), - UI->getOperandValToReplace()); + U.getOperandValToReplace()); assert(UseI != UserInst->op_end() && "cannot find IV operand"); if (IVIncSet.count(UseI)) continue; @@ -2988,8 +2957,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // Record the uses. LSRFixup &LF = getNewFixup(); LF.UserInst = UserInst; - LF.OperandValToReplace = UI->getOperandValToReplace(); - LF.PostIncLoops = UI->getPostIncLoops(); + LF.OperandValToReplace = U.getOperandValToReplace(); + LF.PostIncLoops = U.getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; Type *AccessTy = nullptr; @@ -2998,7 +2967,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { AccessTy = getAccessType(LF.UserInst); } - const SCEV *S = IU.getExpr(*UI); + const SCEV *S = IU.getExpr(U); // Equality (== and !=) ICmps are special. We can rewrite (i == N) as // (N - i == 0), and this allows (N - i) to be the expression that we work @@ -3090,9 +3059,8 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { if (F.ScaledReg) RegUses.CountRegister(F.ScaledReg, LUIdx); - for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), - E = F.BaseRegs.end(); I != E; ++I) - RegUses.CountRegister(*I, LUIdx); + for (const SCEV *BaseReg : F.BaseRegs) + RegUses.CountRegister(BaseReg, LUIdx); } /// InsertFormula - If the given formula has not yet been inserted, add it to @@ -3213,9 +3181,8 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { // Break out add operands. - for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) { - const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); + for (const SCEV *S : Add->operands()) { + const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } @@ -3373,9 +3340,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; - for (SmallVectorImpl<const SCEV *>::const_iterator - I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { - const SCEV *BaseReg = *I; + for (const SCEV *BaseReg : Base.BaseRegs) { if (SE.properlyDominates(BaseReg, L->getHeader()) && !SE.hasComputableLoopEvolution(BaseReg, L)) Ops.push_back(BaseReg); @@ -3432,15 +3397,13 @@ void LSRInstance::GenerateConstantOffsetsImpl( LSRUse &LU, unsigned LUIdx, const Formula &Base, const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; - for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), - E = Worklist.end(); - I != E; ++I) { + for (int64_t Offset : Worklist) { Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - *I; - if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; + if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G); // If it cancelled out, drop the base register, otherwise update it. 
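// Editorial sketch, not from the patch: the invariant behind the rebasing
// in GenerateConstantOffsetsImpl above. Moving a constant between the
// register expression and the immediate leaves the computed address
// unchanged; the numbers below are hypothetical.
#include <cassert>

int main() {
  long G = 100, BaseOffset = 24;            // address = G + BaseOffset
  long Offset = 8;                          // candidate from the worklist
  long NewG = G + Offset;                   // fold the offset into the register...
  long NewBaseOffset = BaseOffset - Offset; // ...and take it back out here
  assert(G + BaseOffset == NewG + NewBaseOffset); // same address either way
  return 0;
}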
if (NewG->isZero()) { if (IsScaledReg) { @@ -3506,10 +3469,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, assert(!Base.BaseGV && "ICmpZero use is not legal!"); // Check each interesting stride. - for (SmallSetVector<int64_t, 8>::const_iterator - I = Factors.begin(), E = Factors.end(); I != E; ++I) { - int64_t Factor = *I; - + for (int64_t Factor : Factors) { // Check that the multiplication doesn't overflow. if (Base.BaseOffset == INT64_MIN && Factor == -1) continue; @@ -3593,10 +3553,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { assert(Base.Scale == 0 && "Unscale did not did its job!"); // Check each interesting stride. - for (SmallSetVector<int64_t, 8>::const_iterator - I = Factors.begin(), E = Factors.end(); I != E; ++I) { - int64_t Factor = *I; - + for (int64_t Factor : Factors) { Base.Scale = Factor; Base.HasBaseReg = Base.BaseRegs.size() > 1; // Check whether this scale is going to be legal. @@ -3652,16 +3609,13 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!DstTy) return; DstTy = SE.getEffectiveSCEVType(DstTy); - for (SmallSetVector<Type *, 4>::const_iterator - I = Types.begin(), E = Types.end(); I != E; ++I) { - Type *SrcTy = *I; + for (Type *SrcTy : Types) { if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { Formula F = Base; - if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); - for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(), - JE = F.BaseRegs.end(); J != JE; ++J) - *J = SE.getAnyExtendExpr(*J, SrcTy); + if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy); + for (const SCEV *&BaseReg : F.BaseRegs) + BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy); // TODO: This assumes we've done basic processing on all uses and // have an idea what the register usage is. @@ -3708,20 +3662,17 @@ void WorkItem::dump() const { void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. typedef std::map<int64_t, const SCEV *> ImmMapTy; - typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy; - RegMapTy Map; + DenseMap<const SCEV *, ImmMapTy> Map; DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; SmallVector<const SCEV *, 8> Sequence; - for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); - I != E; ++I) { - const SCEV *Reg = *I; + for (const SCEV *Use : RegUses) { + const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify. int64_t Imm = ExtractImmediate(Reg, SE); - std::pair<RegMapTy::iterator, bool> Pair = - Map.insert(std::make_pair(Reg, ImmMapTy())); + auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy())); if (Pair.second) Sequence.push_back(Reg); - Pair.first->second.insert(std::make_pair(Imm, *I)); - UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I); + Pair.first->second.insert(std::make_pair(Imm, Use)); + UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use); } // Now examine each set of registers with the same base value. Build up @@ -3729,9 +3680,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // not adding formulae and register counts while we're searching. 
SmallVector<WorkItem, 32> WorkItems; SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; - for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), - E = Sequence.end(); I != E; ++I) { - const SCEV *Reg = *I; + for (const SCEV *Reg : Sequence) { const ImmMapTy &Imms = Map.find(Reg)->second; // It's not worthwhile looking for reuse if there's only one offset. @@ -3739,9 +3688,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { continue; DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':'; - for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); - J != JE; ++J) - dbgs() << ' ' << J->first; + for (const auto &Entry : Imms) + dbgs() << ' ' << Entry.first; dbgs() << '\n'); // Examine each offset. @@ -3786,9 +3734,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { UniqueItems.clear(); // Now iterate through the worklist and add new formulae. - for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), - E = WorkItems.end(); I != E; ++I) { - const WorkItem &WI = *I; + for (const WorkItem &WI : WorkItems) { size_t LUIdx = WI.LUIdx; LSRUse &LU = Uses[LUIdx]; int64_t Imm = WI.Imm; @@ -3827,7 +3773,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) - .ule(abs64(NewF.BaseOffset))) + .ule(std::abs(NewF.BaseOffset))) continue; // OK, looks good. @@ -3853,12 +3799,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // If the new formula has a constant in a register, and adding the // constant value to the immediate would produce a value closer to // zero than the immediate itself, then the formula isn't worthwhile. - for (SmallVectorImpl<const SCEV *>::const_iterator - J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); - J != JE; ++J) - if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) + for (const SCEV *NewReg : NewF.BaseRegs) + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( - abs64(NewF.BaseOffset)) && + std::abs(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= countTrailingZeros<uint64_t>(NewF.BaseOffset)) @@ -3959,9 +3903,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { } else { SmallVector<const SCEV *, 4> Key; - for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), - JE = F.BaseRegs.end(); J != JE; ++J) { - const SCEV *Reg = *J; + for (const SCEV *Reg : F.BaseRegs) { if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) Key.push_back(Reg); } @@ -4023,9 +3965,8 @@ static const size_t ComplexityLimit = UINT16_MAX; /// isn't always sufficient. 
size_t LSRInstance::EstimateSearchSpaceComplexity() const { size_t Power = 1; - for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), - E = Uses.end(); I != E; ++I) { - size_t FSize = I->Formulae.size(); + for (const LSRUse &LU : Uses) { + size_t FSize = LU.Formulae.size(); if (FSize >= ComplexityLimit) { Power = ComplexityLimit; break; @@ -4116,9 +4057,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; - for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), - E = LU.Formulae.end(); I != E; ++I) { - const Formula &F = *I; + for (const Formula &F : LU.Formulae) { if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) continue; @@ -4135,9 +4074,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; // Update the relocs to reference the new use. - for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { - LSRFixup &Fixup = *I; + for (LSRFixup &Fixup : Fixups) { if (Fixup.LUIdx == LUIdx) { Fixup.LUIdx = LUThatHas - &Uses.front(); Fixup.Offset += F.BaseOffset; @@ -4218,9 +4155,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // to be a good reuse register candidate. const SCEV *Best = nullptr; unsigned BestNum = 0; - for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); - I != E; ++I) { - const SCEV *Reg = *I; + for (const SCEV *Reg : RegUses) { if (Taken.count(Reg)) continue; if (!Best) @@ -4308,17 +4243,12 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, SmallPtrSet<const SCEV *, 16> NewRegs; Cost NewCost; - for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), - E = LU.Formulae.end(); I != E; ++I) { - const Formula &F = *I; - + for (const Formula &F : LU.Formulae) { // Ignore formulae which may not be ideal in terms of register reuse of // ReqRegs. The formula should use all required registers before // introducing new ones. int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); - for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), - JE = ReqRegs.end(); J != JE; ++J) { - const SCEV *Reg = *J; + for (const SCEV *Reg : ReqRegs) { if ((F.ScaledReg && F.ScaledReg == Reg) || std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) != F.BaseRegs.end()) { @@ -4426,9 +4356,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, bool AllDominate = true; Instruction *BetterPos = nullptr; Instruction *Tentative = IDom->getTerminator(); - for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), - E = Inputs.end(); I != E; ++I) { - Instruction *Inst = *I; + for (Instruction *Inst : Inputs) { if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { AllDominate = false; break; @@ -4475,9 +4403,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, } // The expansion must also be dominated by the increment positions of any // loops it for which it is using post-inc mode. - for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), - E = LF.PostIncLoops.end(); I != E; ++I) { - const Loop *PIL = *I; + for (const Loop *PIL : LF.PostIncLoops) { if (PIL == L) continue; // Be dominated by the loop exit. @@ -4552,9 +4478,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, SmallVector<const SCEV *, 8> Ops; // Expand the BaseRegs portion. 
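// Editorial sketch, not from the patch: the saturating-product logic of
// EstimateSearchSpaceComplexity above, restated without the LLVM types.
// The search space is the product of per-use formula counts, capped at
// ComplexityLimit so the estimate stays comparable without overflowing.
#include <cstddef>
#include <cstdint>
#include <vector>

static const std::size_t ComplexityLimit = UINT16_MAX;

static std::size_t estimateComplexity(const std::vector<std::size_t> &Sizes) {
  std::size_t Power = 1;
  for (std::size_t FSize : Sizes) {
    // Saturate both on a huge factor and on the running product.
    if (FSize >= ComplexityLimit)
      return ComplexityLimit;
    Power *= FSize;
    if (Power >= ComplexityLimit)
      return ComplexityLimit;
  }
  return Power;
}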
- for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), - E = F.BaseRegs.end(); I != E; ++I) { - const SCEV *Reg = *I; + for (const SCEV *Reg : F.BaseRegs) { assert(!Reg->isZero() && "Zero allocated in a base register!"); // If we're expanding for a post-inc user, make the post-inc adjustment. @@ -4728,12 +4652,14 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // Split the critical edge. BasicBlock *NewBB = nullptr; if (!Parent->isLandingPad()) { - NewBB = SplitCriticalEdge(BB, Parent, P, - /*MergeIdenticalEdges=*/true, - /*DontDeleteUselessPhis=*/true); + NewBB = SplitCriticalEdge(BB, Parent, + CriticalEdgeSplittingOptions(&DT, &LI) + .setMergeIdenticalEdges() + .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, + /*AliasAnalysis*/ nullptr, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4823,7 +4749,8 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // we can remove them after we are done working. SmallVector<WeakVH, 16> DeadInsts; - SCEVExpander Rewriter(SE, "lsr"); + SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), + "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif @@ -4832,25 +4759,20 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Rewriter.setIVIncInsertPos(L, IVIncInsertPos); // Mark phi nodes that terminate chains so the expander tries to reuse them. - for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), - ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { - if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) + for (const IVChain &Chain : IVChainVec) { + if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst())) Rewriter.setChainedPhi(PN); } // Expand the new value definitions and update the users. - for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { - const LSRFixup &Fixup = *I; - + for (const LSRFixup &Fixup : Fixups) { Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); Changed = true; } - for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), - ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { - GenerateIVChain(*ChainI, Rewriter, DeadInsts); + for (const IVChain &Chain : IVChainVec) { + GenerateIVChain(Chain, Rewriter, DeadInsts); Changed = true; } // Clean up after ourselves. This must be done before deleting any @@ -4863,9 +4785,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, LSRInstance::LSRInstance(Loop *L, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfo>()), - TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), - IVIncInsertPos(nullptr) { + LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), + TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent())), + L(L), Changed(false), IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4876,10 +4799,10 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) // If there's too much analysis to be done, bail early. We won't be able to // model the problem anyway. 
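// Editorial sketch, not from the patch: the fluent options object the
// patch adopts (CriticalEdgeSplittingOptions in RewriteForPHI above)
// replaces long positional argument lists with chainable setters. A
// hypothetical minimal analogue of the pattern:
struct SplitOptions {
  bool MergeIdenticalEdges = false;
  bool DontDeleteUselessPHIs = false;
  SplitOptions &setMergeIdenticalEdges() {
    MergeIdenticalEdges = true;
    return *this; // returning *this is what makes the calls chainable
  }
  SplitOptions &setDontDeleteUselessPHIs() {
    DontDeleteUselessPHIs = true;
    return *this;
  }
};
// Usage mirrors the call sites above:
//   splitEdge(BB, Parent, SplitOptions().setMergeIdenticalEdges()
//                                       .setDontDeleteUselessPHIs());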
unsigned NumUsers = 0; - for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + for (const IVStrideUse &U : IU) { if (++NumUsers > MaxIVUsers) { - DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L - << "\n"); + (void)U; + DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n"); return; } } @@ -4948,14 +4871,10 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #ifndef NDEBUG // Formulae should be legal. - for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end(); - I != E; ++I) { - const LSRUse &LU = *I; - for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), - JE = LU.Formulae.end(); - J != JE; ++J) + for (const LSRUse &LU : Uses) { + for (const Formula &F : LU.Formulae) assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, - *J) && "Illegal formula generated!"); + F) && "Illegal formula generated!"); }; #endif @@ -4969,44 +4888,38 @@ void LSRInstance::print_factors_and_types(raw_ostream &OS) const { OS << "LSR has identified the following interesting factors and types: "; bool First = true; - for (SmallSetVector<int64_t, 8>::const_iterator - I = Factors.begin(), E = Factors.end(); I != E; ++I) { + for (int64_t Factor : Factors) { if (!First) OS << ", "; First = false; - OS << '*' << *I; + OS << '*' << Factor; } - for (SmallSetVector<Type *, 4>::const_iterator - I = Types.begin(), E = Types.end(); I != E; ++I) { + for (Type *Ty : Types) { if (!First) OS << ", "; First = false; - OS << '(' << **I << ')'; + OS << '(' << *Ty << ')'; } OS << '\n'; } void LSRInstance::print_fixups(raw_ostream &OS) const { OS << "LSR is examining the following fixup sites:\n"; - for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { + for (const LSRFixup &LF : Fixups) { dbgs() << " "; - I->print(OS); + LF.print(OS); OS << '\n'; } } void LSRInstance::print_uses(raw_ostream &OS) const { OS << "LSR is examining the following uses:\n"; - for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), - E = Uses.end(); I != E; ++I) { - const LSRUse &LU = *I; + for (const LSRUse &LU : Uses) { dbgs() << " "; LU.print(OS); OS << '\n'; - for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), - JE = LU.Formulae.end(); J != JE; ++J) { + for (const Formula &F : LU.Formulae) { OS << " "; - J->print(OS); + F.print(OS); OS << '\n'; } } @@ -5041,11 +4954,11 @@ private: char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) @@ -5064,8 +4977,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // many analyses if they are around. 
 AU.addPreservedID(LoopSimplifyID);
-  AU.addRequired<LoopInfo>();
-  AU.addPreserved<LoopInfo>();
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.addPreserved<LoopInfoWrapperPass>();
   AU.addRequiredID(LoopSimplifyID);
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
@@ -5076,7 +4989,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequiredID(LoopSimplifyID);
   AU.addRequired<IVUsers>();
   AU.addPreserved<IVUsers>();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
 }
 
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
@@ -5092,13 +5005,15 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
     Changed |= DeleteDeadPHIs(L->getHeader());
     if (EnablePhiElim && L->isLoopSimplifyForm()) {
       SmallVector<WeakVH, 16> DeadInsts;
-      SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
+      const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+      SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr");
 #ifndef NDEBUG
       Rewriter.setDebugType(DEBUG_TYPE);
 #endif
       unsigned numFolded = Rewriter.replaceCongruentIVs(
           L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
-          &getAnalysis<TargetTransformInfo>());
+          &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+              *L->getHeader()->getParent()));
       if (numFolded) {
         Changed = true;
         DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index fef5210..ccafd10 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,15 +13,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/FunctionTargetTransformInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/CommandLine.h"
@@ -38,6 +41,22 @@ static cl::opt<unsigned>
 UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
                 cl::desc("The cut-off point for automatic loop unrolling"));
 
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+    "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+    cl::desc("Don't allow loop unrolling to simulate more than this number of "
+             "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollMinPercentOfOptimized(
+    "unroll-percent-of-optimized-for-complete-unroll", cl::init(20), cl::Hidden,
+    cl::desc("If complete unrolling could trigger further optimizations, and, "
+             "by doing so, remove the given percent of instructions, perform "
+             "the complete unroll even if it's beyond the threshold"));
+
+static cl::opt<unsigned> UnrollAbsoluteThreshold(
+    "unroll-absolute-threshold", cl::init(2000), cl::Hidden,
+    cl::desc("Don't unroll if the unrolled size is bigger than this threshold,"
+             " even if we can remove a big portion of instructions later."));
+
 static
cl::opt<unsigned> UnrollCount("unroll-count", cl::init(0), cl::Hidden, cl::desc("Use this unroll count for all loops including those with " @@ -63,11 +82,16 @@ namespace { static char ID; // Pass ID, replacement for typeid LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); + CurrentAbsoluteThreshold = UnrollAbsoluteThreshold; + CurrentMinPercentOfOptimized = UnrollMinPercentOfOptimized; CurrentCount = (C == -1) ? UnrollCount : unsigned(C); CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserAbsoluteThreshold = (UnrollAbsoluteThreshold.getNumOccurrences() > 0); + UserPercentOfOptimized = + (UnrollMinPercentOfOptimized.getNumOccurrences() > 0); UserAllowPartial = (P != -1) || (UnrollAllowPartial.getNumOccurrences() > 0); UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); @@ -91,10 +115,16 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; + unsigned CurrentAbsoluteThreshold; + unsigned CurrentMinPercentOfOptimized; bool CurrentAllowPartial; bool CurrentRuntime; bool UserCount; // CurrentCount is user-specified. bool UserThreshold; // CurrentThreshold is user-specified. + bool UserAbsoluteThreshold; // CurrentAbsoluteThreshold is + // user-specified. + bool UserPercentOfOptimized; // CurrentMinPercentOfOptimized is + // user-specified. bool UserAllowPartial; // CurrentAllowPartial is user-specified. bool UserRuntime; // CurrentRuntime is user-specified. @@ -105,16 +135,15 @@ namespace { /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); - AU.addRequired<FunctionTargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. @@ -124,9 +153,11 @@ namespace { // Fill in the UnrollingPreferences parameter with values from the // TargetTransformationInfo. - void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI, + void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, TargetTransformInfo::UnrollingPreferences &UP) { UP.Threshold = CurrentThreshold; + UP.AbsoluteThreshold = CurrentAbsoluteThreshold; + UP.MinPercentOfOptimized = CurrentMinPercentOfOptimized; UP.OptSizeThreshold = OptSizeUnrollThreshold; UP.PartialThreshold = CurrentThreshold; UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; @@ -134,7 +165,8 @@ namespace { UP.MaxCount = UINT_MAX; UP.Partial = CurrentAllowPartial; UP.Runtime = CurrentRuntime; - FTTI.getUnrollingPreferences(L, UP); + UP.AllowExpensiveTripCount = false; + TTI.getUnrollingPreferences(L, UP); } // Select and return an unroll count based on parameters from @@ -153,7 +185,9 @@ namespace { // unrolled loops respectively. 
    void selectThresholds(const Loop *L, bool HasPragma,
                          const TargetTransformInfo::UnrollingPreferences &UP,
-                         unsigned &Threshold, unsigned &PartialThreshold) {
+                         unsigned &Threshold, unsigned &PartialThreshold,
+                         unsigned &AbsoluteThreshold,
+                         unsigned &PercentOfOptimizedForCompleteUnroll) {
      // Determine the current unrolling threshold. While this is
      // normally set from UnrollThreshold, it is overridden to a
      // smaller value if the current function is marked as
      // optimize-for-size, and the unroll threshold was not user
      // specified.
      Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
      PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
+     AbsoluteThreshold = UserAbsoluteThreshold ? CurrentAbsoluteThreshold
+                                               : UP.AbsoluteThreshold;
+     PercentOfOptimizedForCompleteUnroll = UserPercentOfOptimized
+                                               ? CurrentMinPercentOfOptimized
+                                               : UP.MinPercentOfOptimized;
+
      if (!UserThreshold &&
-         L->getHeader()->getParent()->getAttributes().
-             hasAttribute(AttributeSet::FunctionIndex,
-                          Attribute::OptimizeForSize)) {
+         L->getHeader()->getParent()->hasFnAttribute(
+             Attribute::OptimizeForSize)) {
        Threshold = UP.OptSizeThreshold;
        PartialThreshold = UP.PartialOptSizeThreshold;
      }
@@ -180,15 +219,18 @@ namespace {
          std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold);
      }
    }
+   bool canUnrollCompletely(Loop *L, unsigned Threshold,
+                            unsigned AbsoluteThreshold, uint64_t UnrolledSize,
+                            unsigned NumberOfOptimizedInstructions,
+                            unsigned PercentOfOptimizedForCompleteUnroll);
  };
}

char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -203,6 +245,407 @@ Pass *llvm::createSimpleLoopUnrollPass() {
   return llvm::createLoopUnrollPass(-1, -1, 0, 0);
 }
 
+namespace {
+/// \brief SCEV expression visitor used for finding expressions that would
+/// become constants if the loop L is unrolled.
+struct FindConstantPointers {
+  /// \brief Whether the expression is of the form ConstAddress+Constant.
+  bool IndexIsConstant;
+
+  /// \brief Used for filtering out SCEV expressions with two or more AddRec
+  /// subexpressions.
+  ///
+  /// Used to filter out complicated SCEV expressions that have several AddRec
+  /// sub-expressions. We don't handle them because unrolling one loop would
+  /// replace only one of these inductions with a constant, and consequently
+  /// the expression would remain non-constant.
+  bool HaveSeenAR;
+
+  /// \brief If the SCEV expression becomes ConstAddress+Constant, this value
+  /// holds ConstAddress. Otherwise, it's nullptr.
+  Value *BaseAddress;
+
+  /// \brief The loop we try to completely unroll.
+  const Loop *L;
+
+  ScalarEvolution &SE;
+
+  FindConstantPointers(const Loop *L, ScalarEvolution &SE)
+      : IndexIsConstant(true), HaveSeenAR(false), BaseAddress(nullptr),
+        L(L), SE(SE) {}
+
+  /// Examine the given expression S and figure out whether it can be part of
+  /// an expression that could become a constant after the loop is unrolled.
+  /// The routine sets IndexIsConstant and HaveSeenAR according to the analysis
+  /// results.
+  /// \returns true if we need to examine subexpressions, and false otherwise.
+  bool follow(const SCEV *S) {
+    if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {
+      // We've reached a leaf node of the SCEV; it's most likely just a
+      // variable.
+      // If it's the only SCEV sub-expression, then it might be a base
+      // address of an index expression.
+      // If we've already recorded a base address, then just give up on this
+      // SCEV - it's too complicated.
+      if (BaseAddress) {
+        IndexIsConstant = false;
+        return false;
+      }
+      BaseAddress = SC->getValue();
+      return false;
+    }
+    if (isa<SCEVConstant>(S))
+      return false;
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+      // If the current SCEV expression is an AddRec, and its loop isn't the
+      // loop we are about to unroll, then we won't get a constant address
+      // after unrolling, and thus, won't be able to eliminate the load.
+      if (AR->getLoop() != L) {
+        IndexIsConstant = false;
+        return false;
+      }
+      // We don't handle multiple AddRecs here, so give up in this case.
+      if (HaveSeenAR) {
+        IndexIsConstant = false;
+        return false;
+      }
+      HaveSeenAR = true;
+    }
+
+    // Continue traversal.
+    return true;
+  }
+  bool isDone() const { return !IndexIsConstant; }
+};
+} // End anonymous namespace.
+
+namespace {
+/// \brief A cache of SCEV results used to optimize repeated queries to SCEV on
+/// the same set of instructions.
+///
+/// The primary cost this saves is the cost of checking the validity of a SCEV
+/// every time it is looked up. However, in some cases we can provide a reduced
+/// and especially useful model for an instruction based upon SCEV that is
+/// non-trivial to compute but more useful to clients.
+class SCEVCache {
+public:
+  /// \brief Struct to represent a GEP whose start and step are known fixed
+  /// offsets from a base address due to SCEV's analysis.
+  struct GEPDescriptor {
+    Value *BaseAddr = nullptr;
+    unsigned Start = 0;
+    unsigned Step = 0;
+  };
+
+  Optional<GEPDescriptor> getGEPDescriptor(GetElementPtrInst *GEP);
+
+  SCEVCache(const Loop &L, ScalarEvolution &SE) : L(L), SE(SE) {}
+
+private:
+  const Loop &L;
+  ScalarEvolution &SE;
+
+  SmallDenseMap<GetElementPtrInst *, GEPDescriptor> GEPDescriptors;
+};
+} // End anonymous namespace.
+
+/// \brief Get a simplified descriptor for a GEP instruction.
+///
+/// Where possible, this produces a simplified descriptor for a GEP instruction
+/// using SCEV analysis of the containing loop. If this isn't possible, it
+/// returns an empty optional.
+///
+/// The model is a base address, an initial offset, and a per-iteration step.
+/// This fits very common patterns of GEPs inside loops and is something we can
+/// use to simulate the behavior of a particular iteration of a loop.
+///
+/// This is a cached interface. The first call may do non-trivial work to
+/// compute the result, but all subsequent calls will return a fast answer
+/// based on a cached result. This includes caching negative results.
+Optional<SCEVCache::GEPDescriptor>
+SCEVCache::getGEPDescriptor(GetElementPtrInst *GEP) {
+  decltype(GEPDescriptors)::iterator It;
+  bool Inserted;
+
+  std::tie(It, Inserted) = GEPDescriptors.insert({GEP, {}});
+
+  if (!Inserted) {
+    if (!It->second.BaseAddr)
+      return None;
+
+    return It->second;
+  }
+
+  // We've inserted a new record into the cache, so compute the GEP descriptor
+  // if possible.
+  Value *V = cast<Value>(GEP);
+  if (!SE.isSCEVable(V->getType()))
+    return None;
+  const SCEV *S = SE.getSCEV(V);
+
+  // FIXME: It'd be nice if the worklist and set used by the
+  // SCEVTraversal could be re-used between loop iterations, but the
+  // interface doesn't support that. There is no way to clear the visited
+  // sets between uses.
+  FindConstantPointers Visitor(&L, SE);
+  SCEVTraversal<FindConstantPointers> T(Visitor);
+
+  // Try to find a (BaseAddress, Start, Step) tuple.
+  // If we succeed, save it to the cache - it might help in folding loads.
+  T.visitAll(S);
+  if (!Visitor.IndexIsConstant || !Visitor.BaseAddress)
+    return None;
+
+  const SCEV *BaseAddrSE = SE.getSCEV(Visitor.BaseAddress);
+  if (BaseAddrSE->getType() != S->getType())
+    return None;
+  const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
+
+  if (!AR)
+    return None;
+
+  const SCEVConstant *StepSE =
+      dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE));
+  const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart());
+  if (!StepSE || !StartSE)
+    return None;
+
+  // Check and skip caching if doing so would require lots of bits to
+  // avoid overflow.
+  APInt Start = StartSE->getValue()->getValue();
+  APInt Step = StepSE->getValue()->getValue();
+  if (Start.getActiveBits() > 32 || Step.getActiveBits() > 32)
+    return None;
+
+  // We found a cacheable SCEV model for the GEP.
+  It->second.BaseAddr = Visitor.BaseAddress;
+  It->second.Start = Start.getLimitedValue();
+  It->second.Step = Step.getLimitedValue();
+  return It->second;
+}
+
+namespace {
+// This class is used to get an estimate of the optimization effects that we
+// could get from complete loop unrolling. It comes from the fact that some
+// loads might be replaced with concrete constant values and that could trigger
+// a chain of instruction simplifications.
+//
+// E.g. we might have:
+//   int a[] = {0, 1, 0};
+//   v = 0;
+//   for (i = 0; i < 3; i++)
+//     v += b[i]*a[i];
+// If we completely unroll the loop, we would get:
+//   v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
+// which will then be simplified to:
+//   v = b[0]*0 + b[1]*1 + b[2]*0
+// and finally:
+//   v = b[1]
+class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> {
+  typedef InstVisitor<UnrolledInstAnalyzer, bool> Base;
+  friend class InstVisitor<UnrolledInstAnalyzer, bool>;
+
+public:
+  UnrolledInstAnalyzer(unsigned Iteration,
+                       DenseMap<Value *, Constant *> &SimplifiedValues,
+                       SCEVCache &SC)
+      : Iteration(Iteration), SimplifiedValues(SimplifiedValues), SC(SC) {}
+
+  // Allow access to the initial visit method.
+  using Base::visit;
+
+private:
+  /// \brief The number of the iteration currently being simulated.
+  ///
+  /// If an expression is ConstAddress+Constant, then the Constant is
+  /// Start + Iteration*Step, where Start and Step could be obtained from
+  /// the SCEVCache.
+  unsigned Iteration;
+
+  // While we walk the loop instructions, we build up and maintain a mapping
+  // of simplified values specific to this iteration. The idea is to propagate
+  // any special information we have about loads that can be replaced with
+  // constants after complete unrolling, and account for likely simplifications
+  // post-unrolling.
+  DenseMap<Value *, Constant *> &SimplifiedValues;
+
+  // We use a cache to wrap all our SCEV queries.
+  SCEVCache &SC;
+
+  /// Base case for the instruction visitor.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt.
+
+  /// Try to simplify binary operator I.
+  ///
+  /// TODO: Probably it's worth hoisting the code for estimating the
+  /// simplification effects into a separate class, since we already have
+  /// very similar code in InlineCost.
+  bool visitBinaryOperator(BinaryOperator &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+    Value *SimpleV = nullptr;
+    const DataLayout &DL = I.getModule()->getDataLayout();
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV =
+          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
+    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
+      SimplifiedValues[&I] = C;
+
+    return SimpleV;
+  }
+
+  /// Try to fold load I.
+  bool visitLoad(LoadInst &I) {
+    Value *AddrOp = I.getPointerOperand();
+    if (!isa<Constant>(AddrOp))
+      if (Constant *SimplifiedAddrOp = SimplifiedValues.lookup(AddrOp))
+        AddrOp = SimplifiedAddrOp;
+
+    auto *GEP = dyn_cast<GetElementPtrInst>(AddrOp);
+    if (!GEP)
+      return false;
+    auto OptionalGEPDesc = SC.getGEPDescriptor(GEP);
+    if (!OptionalGEPDesc)
+      return false;
+
+    auto GV = dyn_cast<GlobalVariable>(OptionalGEPDesc->BaseAddr);
+    // We're only interested in loads that can be completely folded to a
+    // constant.
+    if (!GV || !GV->hasInitializer())
+      return false;
+
+    ConstantDataSequential *CDS =
+        dyn_cast<ConstantDataSequential>(GV->getInitializer());
+    if (!CDS)
+      return false;
+
+    // This calculation should never overflow because we bound Iteration quite
+    // low and both the start and step are 32-bit integers. We use signed
+    // integers so that UBSan will catch if a bug sneaks into the code.
+    int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+    int64_t Index = ((int64_t)OptionalGEPDesc->Start +
+                     (int64_t)OptionalGEPDesc->Step * (int64_t)Iteration) /
+                    ElemSize;
+    if (Index >= CDS->getNumElements()) {
+      // FIXME: For now we conservatively ignore out of bound accesses, but
+      // we're allowed to perform the optimization in this case.
+      return false;
+    }
+
+    Constant *CV = CDS->getElementAsConstant(Index);
+    assert(CV && "Constant expected.");
+    SimplifiedValues[&I] = CV;
+
+    return true;
+  }
+};
+} // namespace
+
+
+namespace {
+struct EstimatedUnrollCost {
+  /// \brief The number of instructions that will be optimized away.
+  unsigned NumberOfOptimizedInstructions;
+
+  /// \brief The total number of instructions in the unrolled loop.
+  unsigned UnrolledLoopSize;
+};
+}
+
+/// \brief Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities. This routine
+/// estimates this optimization. It assigns the number of instructions that
+/// could potentially be optimized away to NumberOfOptimizedInstructions, and
+/// the total number of instructions to UnrolledLoopSize (not counting blocks
+/// that won't be reached, if we were able to compute their condition).
+/// \returns None if we can't analyze the loop, or if we discovered that
+/// unrolling won't give anything. Otherwise, returns the estimated cost.
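As a worked example of the numbers this estimate produces and how canUnrollCompletely() (below) consumes them — all values here are hypothetical:

    // Hypothetical outcome of the simulation, fed to canUnrollCompletely():
    unsigned UnrolledLoopSize = 120;             // total TTI cost of the body
    unsigned NumberOfOptimizedInstructions = 72; // cost of what folds away
    unsigned Percent = NumberOfOptimizedInstructions * 100 / UnrolledLoopSize;
    // Percent == 60, so if PercentOfOptimizedForCompleteUnroll were 50 and
    // UnrolledLoopSize stayed below AbsoluteThreshold, full unrolling would
    // be approved even though 120 exceeds the plain Threshold.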
+Optional<EstimatedUnrollCost>
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
+                      const TargetTransformInfo &TTI,
+                      unsigned MaxUnrolledLoopSize) {
+  // We want to be able to scale offsets by the trip count and add more offsets
+  // to them without checking for overflows, and we already don't want to
+  // analyze *massive* trip counts, so we force the max to be reasonably small.
+  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
+         "The unroll iterations max is too large!");
+
+  // Don't simulate loops with a big or unknown trip count.
+  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
+      TripCount > UnrollMaxIterationsCountToAnalyze)
+    return None;
+
+  SmallSetVector<BasicBlock *, 16> BBWorklist;
+  DenseMap<Value *, Constant *> SimplifiedValues;
+
+  // Use a cache to access SCEV expressions so that we don't pay the cost on
+  // each iteration. This cache is lazily self-populating.
+  SCEVCache SC(*L, SE);
+
+  unsigned NumberOfOptimizedInstructions = 0;
+  unsigned UnrolledLoopSize = 0;
+
+  // Simulate execution of each iteration of the loop, counting the
+  // instructions that would be simplified. Since the same load can take
+  // different values on different iterations, we literally have to go through
+  // all of the loop's iterations.
+  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+    SimplifiedValues.clear();
+    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SC);
+
+    BBWorklist.clear();
+    BBWorklist.insert(L->getHeader());
+    // Note that we *must not* cache the size; this loop grows the worklist.
+    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+      BasicBlock *BB = BBWorklist[Idx];
+
+      // Visit all instructions in the given basic block and try to simplify
+      // them. We don't change the actual IR, just count optimization
+      // opportunities.
+      for (Instruction &I : *BB) {
+        UnrolledLoopSize += TTI.getUserCost(&I);
+
+        // Visit the instruction to analyze its loop cost after unrolling,
+        // and if the visitor returns true, then we can optimize this
+        // instruction away.
+        if (Analyzer.visit(I))
+          NumberOfOptimizedInstructions += TTI.getUserCost(&I);
+
+        // If the unrolled body turns out to be too big, bail out.
+        if (UnrolledLoopSize - NumberOfOptimizedInstructions >
+            MaxUnrolledLoopSize)
+          return None;
+      }
+
+      // Add BB's successors to the worklist.
+      for (BasicBlock *Succ : successors(BB))
+        if (L->contains(Succ))
+          BBWorklist.insert(Succ);
+    }
+
+    // If we found no optimization opportunities on the first iteration, we
+    // won't find them on later ones either.
+    if (!NumberOfOptimizedInstructions)
+      return None;
+  }
+  return {{NumberOfOptimizedInstructions, UnrolledLoopSize}};
+}
+
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
                                     bool &NotDuplicatable,
@@ -234,44 +677,31 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 // Returns the loop hint metadata node with the given name (for example,
 // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
 // returned.
-static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) {
-  MDNode *LoopID = L->getLoopID();
-  if (!LoopID)
-    return nullptr;
-
-  // First operand should refer to the loop id itself.
-  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
-  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
-  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
-    const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
-    if (!MD)
-      continue;
-
-    const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
-    if (!S)
-      continue;
-
-    if (Name.equals(S->getString()))
-      return MD;
-  }
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+  if (MDNode *LoopID = L->getLoopID())
+    return GetUnrollMetadata(LoopID, Name);
   return nullptr;
 }
 
 // Returns true if the loop has an unroll(full) pragma.
 static bool HasUnrollFullPragma(const Loop *L) {
-  return GetUnrollMetadata(L, "llvm.loop.unroll.full");
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
 }
 
 // Returns true if the loop has an unroll(disable) pragma.
 static bool HasUnrollDisablePragma(const Loop *L) {
-  return GetUnrollMetadata(L, "llvm.loop.unroll.disable");
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
+}
+
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
 }
 
 // If the loop has an unroll_count pragma, return the (necessarily
 // positive) value from the pragma. Otherwise return 0.
 static unsigned UnrollCountPragmaValue(const Loop *L) {
-  const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count");
+  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
   if (MD) {
     assert(MD->getNumOperands() == 2 &&
            "Unroll count hint metadata should have two operands.");
@@ -319,6 +749,49 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
   L->setLoopID(NewLoopID);
 }
 
+bool LoopUnroll::canUnrollCompletely(
+    Loop *L, unsigned Threshold, unsigned AbsoluteThreshold,
+    uint64_t UnrolledSize, unsigned NumberOfOptimizedInstructions,
+    unsigned PercentOfOptimizedForCompleteUnroll) {
+
+  if (Threshold == NoThreshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because no threshold is set.\n");
+    return true;
+  }
+
+  if (UnrolledSize <= Threshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because unrolled size: "
                 << UnrolledSize << "<" << Threshold << "\n");
+    return true;
+  }
+
+  assert(UnrolledSize && "UnrolledSize can't be 0 at this point.");
+  unsigned PercentOfOptimizedInstructions =
+      (uint64_t)NumberOfOptimizedInstructions * 100ull / UnrolledSize;
+
+  if (UnrolledSize <= AbsoluteThreshold &&
+      PercentOfOptimizedInstructions >= PercentOfOptimizedForCompleteUnroll) {
+    DEBUG(dbgs() << "  Can fully unroll, because unrolling will help remove "
                 << PercentOfOptimizedInstructions
                 << "% instructions (threshold: "
                 << PercentOfOptimizedForCompleteUnroll << "%)\n");
+    DEBUG(dbgs() << "  Unrolled size (" << UnrolledSize
                 << ") is less than the threshold (" << AbsoluteThreshold
                 << ").\n");
+    return true;
+  }
+
+  DEBUG(dbgs() << "  Too large to fully unroll:\n");
+  DEBUG(dbgs() << "    Unrolled size: " << UnrolledSize << "\n");
+  DEBUG(dbgs() << "    Estimated number of optimized instructions: "
               << NumberOfOptimizedInstructions << "\n");
+  DEBUG(dbgs() << "    Absolute threshold: " << AbsoluteThreshold << "\n");
+  DEBUG(dbgs() << "    Minimum percent of removed instructions: "
               << PercentOfOptimizedForCompleteUnroll << "\n");
+  DEBUG(dbgs() << "    Threshold for small loops: " << Threshold << "\n");
+  return false;
+}
+
 unsigned LoopUnroll::selectUnrollCount(
     const Loop *L, unsigned TripCount, bool PragmaFullUnroll, unsigned
PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, @@ -363,13 +836,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - LoopInfo *LI = &getAnalysis<LoopInfo>(); + Function &F = *L->getHeader()->getParent(); + + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); - const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - const FunctionTargetTransformInfo &FTTI = - getAnalysis<FunctionTargetTransformInfo>(); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( - *L->getHeader()->getParent()); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -383,7 +856,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { bool HasPragma = PragmaFullUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; - getUnrollingPreferences(L, FTTI, UP); + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -426,20 +899,33 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } unsigned Threshold, PartialThreshold; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); + unsigned AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll; + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, + AbsoluteThreshold, PercentOfOptimizedForCompleteUnroll); // Given Count, TripCount and thresholds determine the type of // unrolling which is to be performed. enum { Full = 0, Partial = 1, Runtime = 2 }; int Unrolling; if (TripCount && Count == TripCount) { - if (Threshold != NoThreshold && UnrolledSize > Threshold) { - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << UnrolledSize << ">" << Threshold - << "\n"); - Unrolling = Partial; - } else { + Unrolling = Partial; + // If the loop is really small, we don't need to run an expensive analysis. + if (canUnrollCompletely( + L, Threshold, AbsoluteThreshold, + UnrolledSize, 0, 100)) { Unrolling = Full; + } else { + // The loop isn't that small, but we still can fully unroll it if that + // helps to remove a significant number of instructions. + // To check that, run additional analysis on the loop. + if (Optional<EstimatedUnrollCost> Cost = + analyzeLoopUnrollCost(L, TripCount, *SE, TTI, AbsoluteThreshold)) + if (canUnrollCompletely(L, Threshold, AbsoluteThreshold, + Cost->UnrolledLoopSize, + Cost->NumberOfOptimizedInstructions, + PercentOfOptimizedForCompleteUnroll)) { + Unrolling = Full; + } } } else if (TripCount && Count < TripCount) { Unrolling = Partial; @@ -450,6 +936,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; + if (HasRuntimeUnrollDisablePragma(L)) { + AllowRuntime = false; + } if (Unrolling == Partial) { bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; if (!AllowPartial && !CountSetExplicitly) { @@ -518,8 +1007,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. 
-  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
-                  &LPM, &AC))
+  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount,
+                  TripMultiple, LI, this, &LPM, &AC))
     return false;
 
   return true;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 9f4c122..988d2af 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -170,13 +171,13 @@ namespace {
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
-      AU.addRequired<LoopInfo>();
-      AU.addPreserved<LoopInfo>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LCSSAID);
       AU.addPreservedID(LCSSAID);
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
   private:
@@ -333,10 +334,10 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
 char LoopUnswitch::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                     false, false)
@@ -387,7 +388,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
       *L->getHeader()->getParent());
-  LI = &getAnalysis<LoopInfo>();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   LPM = &LPM_Ref;
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
@@ -432,8 +433,10 @@ bool LoopUnswitch::processCurrentLoop() {
 
   // We have probably reached the quota of branches for this loop. If so,
   // stop unswitching.
-  if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(),
-                              AC))
+  if (!BranchesInfo.countLoop(
+          currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                           *currentLoop->getHeader()->getParent()),
+          AC))
     return false;
 
   // Loop over all of the basic blocks in the loop. If we find an interior
@@ -655,9 +658,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
 
   // Check to see if it would be profitable to unswitch the current loop.
   // Do not do non-trivial unswitch while optimizing for size.
- if (OptimizeForSize || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) + if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) return false; UnswitchNontrivialCondition(LoopCond, Val, currentLoop); @@ -675,7 +676,7 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) if (LI->getLoopFor(*I) == L) - New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase()); + New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) @@ -706,8 +707,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, // If either edge is critical, split it. This helps preserve LoopSimplify // form for enclosing loops. - SplitCriticalEdge(BI, 0, this, false, false, true); - SplitCriticalEdge(BI, 1, this, false, false, true); + auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA(); + SplitCriticalEdge(BI, 0, Options); + SplitCriticalEdge(BI, 1, Options); } /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable @@ -726,7 +728,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // First step, split the preheader, so that we know that there is a safe place // to insert the conditional branch. We will change loopPreheader to have a // conditional branch on Cond. - BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this); + BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI); // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we should @@ -737,7 +739,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); + BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. @@ -768,13 +770,9 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - if (!ExitBlock->isLandingPad()) { - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", - this, NewBBs); - } + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", + /*AliasAnalysis*/ nullptr, DT, LI, + /*PreserveLCSSA*/ true); } } @@ -797,7 +795,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. - BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this); + BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI); LoopBlocks.push_back(NewPreheader); // We want the loop to come after the preheader, but before the exit blocks. @@ -850,14 +848,14 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (ParentLoop) { // Make sure to add the cloned preheader and exit blocks to the parent loop // as well. 
- ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); + ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI); } for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) - ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); + ExitBBLoop->addBasicBlockToLoop(NewExit, *LI); assert(NewExit->getTerminator()->getNumSuccessors() == 1 && "Exit block should have been split to have one successor!"); @@ -1043,7 +1041,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // and hooked up so as to preserve the loop structure, because // trying to update it is complicated. So instead we preserve the // loop structure and put the block on a dead code path. - SplitEdge(Switch, SISucc, this); + SplitEdge(Switch, SISucc, DT, LI); // Compute the successors instead of relying on the return value // of SplitEdge, since it may have split the switch successor // after PHI nodes. @@ -1085,6 +1083,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, /// pass. /// void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); while (!Worklist.empty()) { Instruction *I = Worklist.back(); Worklist.pop_back(); @@ -1107,7 +1106,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // See if instruction simplification can hack this up. This is common for // things like "select false, X, Y" after unswitching made the condition be // 'false'. TODO: update the domtree properly so we can pass it here. - if (Value *V = SimplifyInstruction(I)) + if (Value *V = SimplifyInstruction(I, DL)) if (LI->replacementPreservesLCSSAForm(I, V)) { ReplaceUsesOfWith(I, V, Worklist, L, LPM); continue; diff --git a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index ff89e74..0c47cbd 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -11,7 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -24,13 +25,14 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include <vector> +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "lower-expect-intrinsic" -STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled"); +STATISTIC(ExpectIntrinsicsHandled, + "Number of 'expect' intrinsic instructions handled"); static cl::opt<uint32_t> LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64), @@ -39,27 +41,8 @@ static cl::opt<uint32_t> UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4), cl::desc("Weight of the branch unlikely to be taken (default = 4)")); -namespace { - - class LowerExpectIntrinsic : public FunctionPass { - - bool HandleSwitchExpect(SwitchInst *SI); - - bool HandleIfExpect(BranchInst *BI); - - public: - static char ID; - LowerExpectIntrinsic() : FunctionPass(ID) { - initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry()); 
- } - - bool runOnFunction(Function &F) override; - }; -} - - -bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { - CallInst *CI = dyn_cast<CallInst>(SI->getCondition()); +static bool handleSwitchExpect(SwitchInst &SI) { + CallInst *CI = dyn_cast<CallInst>(SI.getCondition()); if (!CI) return false; @@ -72,26 +55,24 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { if (!ExpectedValue) return false; - SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); - unsigned n = SI->getNumCases(); // +1 for default case. - std::vector<uint32_t> Weights(n + 1); + SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue); + unsigned n = SI.getNumCases(); // +1 for default case. + SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight); - Weights[0] = Case == SI->case_default() ? LikelyBranchWeight - : UnlikelyBranchWeight; - for (unsigned i = 0; i != n; ++i) - Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight - : UnlikelyBranchWeight; + if (Case == SI.case_default()) + Weights[0] = LikelyBranchWeight; + else + Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight; - SI->setMetadata(LLVMContext::MD_prof, - MDBuilder(CI->getContext()).createBranchWeights(Weights)); + SI.setMetadata(LLVMContext::MD_prof, + MDBuilder(CI->getContext()).createBranchWeights(Weights)); - SI->setCondition(ArgValue); + SI.setCondition(ArgValue); return true; } - -bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { - if (BI->isUnconditional()) +static bool handleBranchExpect(BranchInst &BI) { + if (BI.isUnconditional()) return false; // Handle non-optimized IR code like: @@ -105,9 +86,9 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { CallInst *CI; - ICmpInst *CmpI = dyn_cast<ICmpInst>(BI->getCondition()); + ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition()); if (!CmpI) { - CI = dyn_cast<CallInst>(BI->getCondition()); + CI = dyn_cast<CallInst>(BI.getCondition()); } else { if (CmpI->getPredicate() != CmpInst::ICMP_NE) return false; @@ -136,32 +117,30 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { else Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); - BI->setMetadata(LLVMContext::MD_prof, Node); + BI.setMetadata(LLVMContext::MD_prof, Node); if (CmpI) CmpI->setOperand(0, ArgValue); else - BI->setCondition(ArgValue); + BI.setCondition(ArgValue); return true; } +static bool lowerExpectIntrinsic(Function &F) { + bool Changed = false; -bool LowerExpectIntrinsic::runOnFunction(Function &F) { - for (Function::iterator I = F.begin(), E = F.end(); I != E;) { - BasicBlock *BB = I++; - + for (BasicBlock &BB : F) { // Create "block_weights" metadata. - if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - if (HandleIfExpect(BI)) - IfHandled++; - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - if (HandleSwitchExpect(SI)) - IfHandled++; + if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) { + if (handleBranchExpect(*BI)) + ExpectIntrinsicsHandled++; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) { + if (handleSwitchExpect(*SI)) + ExpectIntrinsicsHandled++; } // remove llvm.expect intrinsics. 
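For context, the intrinsics being lowered here almost always originate from __builtin_expect in C/C++ sources; a small illustrative example follows (the 4:64 ratio matches the default unlikely/likely branch weights defined above):

    /* Typical macros found in C code bases: */
    #define likely(x)   __builtin_expect(!!(x), 1)
    #define unlikely(x) __builtin_expect(!!(x), 0)

    int process(int err) {
      if (unlikely(err != 0))  /* weights 4 (if-body) : 64 (fall-through) */
        return -1;
      return 0;
    }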
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ) { + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { CallInst *CI = dyn_cast<CallInst>(BI++); if (!CI) continue; @@ -171,17 +150,42 @@ bool LowerExpectIntrinsic::runOnFunction(Function &F) { Value *Exp = CI->getArgOperand(0); CI->replaceAllUsesWith(Exp); CI->eraseFromParent(); + Changed = true; } } } - return false; + return Changed; } +PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) { + if (lowerExpectIntrinsic(F)) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); +} + +namespace { +/// \brief Legacy pass for lowering expect intrinsics out of the IR. +/// +/// When this pass is run over a function it uses expect intrinsics which feed +/// branches and switches to provide branch weight metadata for those +/// terminators. It then removes the expect intrinsics from the IR so the rest +/// of the optimizer can ignore them. +class LowerExpectIntrinsic : public FunctionPass { +public: + static char ID; + LowerExpectIntrinsic() : FunctionPass(ID) { + initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); } +}; +} char LowerExpectIntrinsic::ID = 0; -INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", "Lower 'expect' " - "Intrinsics", false, false) +INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", + "Lower 'expect' Intrinsics", false, false) FunctionPass *llvm::createLowerExpectIntrinsicPass() { return new LowerExpectIntrinsic(); diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3eea3d4..66d6ac6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -28,7 +29,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -41,7 +41,8 @@ STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, - bool &VariableIdxFound, const DataLayout &TD){ + bool &VariableIdxFound, + const DataLayout &DL) { // Skip over the first indices. gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) @@ -57,13 +58,13 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, // Handle struct indices, which add their field offset to the pointer. if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); + Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } // Otherwise, we have a sequential type like an array or vector. Multiply // the index by the ElementSize. 
- uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*OpC->getSExtValue(); } @@ -74,7 +75,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, /// constant offset, and return that constant offset. For example, Ptr1 might /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const DataLayout &TD) { + const DataLayout &DL) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); @@ -92,12 +93,12 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // If one pointer is a GEP and the other isn't, then see if the GEP is a // constant offset from the base, as in "P" and "gep P, 1". if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { - Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); + Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL); return !VariableIdxFound; } if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { - Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); + Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL); return !VariableIdxFound; } @@ -115,8 +116,8 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) break; - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); + int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL); + int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL); if (VariableIdxFound) return false; Offset = Offset2-Offset1; @@ -150,12 +151,11 @@ struct MemsetRange { /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - bool isProfitableToUseMemset(const DataLayout &TD) const; - + bool isProfitableToUseMemset(const DataLayout &DL) const; }; } // end anon namespace -bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { +bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If we found more than 4 stores to merge or 16 bytes, use memset. if (TheStores.size() >= 4 || End-Start >= 16) return true; @@ -183,7 +183,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // size. If so, check to see whether we will end up actually reducing the // number of stores used. 
unsigned Bytes = unsigned(End-Start); - unsigned MaxIntSize = TD.getLargestLegalIntTypeSize(); + unsigned MaxIntSize = DL.getLargestLegalIntTypeSize(); if (MaxIntSize == 0) MaxIntSize = 1; unsigned NumPointerStores = Bytes / MaxIntSize; @@ -314,14 +314,12 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const DataLayout *DL; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); MD = nullptr; TLI = nullptr; - DL = nullptr; } bool runOnFunction(Function &F) override; @@ -334,7 +332,7 @@ namespace { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -346,8 +344,9 @@ namespace { bool processMemMove(MemMoveInst *M); bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, uint64_t cpyLen, unsigned cpyAlign, CallInst *C); - bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, - uint64_t MSize); + bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep); + bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep); + bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep); bool processByValArgument(CallSite CS, unsigned ArgNo); Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr, Value *ByteVal); @@ -366,7 +365,7 @@ INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) @@ -377,13 +376,13 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", /// attempts to merge them together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { - if (!DL) return nullptr; + const DataLayout &DL = StartInst->getModule()->getDataLayout(); // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. - MemsetRanges Ranges(*DL); + MemsetRanges Ranges(DL); BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { @@ -406,8 +405,8 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), - Offset, *DL)) + if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, + DL)) break; Ranges.addStore(Offset, NextStore); @@ -420,7 +419,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Check to see if this store is to a constant offset from the start ptr. 
int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL)) + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL)) break; Ranges.addMemSet(Offset, MSI); @@ -452,7 +451,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. - if (!Range.isProfitableToUseMemset(*DL)) + if (!Range.isProfitableToUseMemset(DL)) continue; // Otherwise, we do want to transform this! Create a new memset. @@ -464,7 +463,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (Alignment == 0) { Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = DL->getABITypeAlignment(EltType); + Alignment = DL.getABITypeAlignment(EltType); } AMemSet = @@ -494,8 +493,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - - if (!DL) return false; + const DataLayout &DL = SI->getModule()->getDataLayout(); // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than @@ -525,16 +523,16 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { unsigned storeAlign = SI->getAlignment(); if (!storeAlign) - storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType()); + storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); unsigned loadAlign = LI->getAlignment(); if (!loadAlign) - loadAlign = DL->getABITypeAlignment(LI->getType()); + loadAlign = DL.getABITypeAlignment(LI->getType()); - bool changed = performCallSlotOptzn(LI, - SI->getPointerOperand()->stripPointerCasts(), - LI->getPointerOperand()->stripPointerCasts(), - DL->getTypeStoreSize(SI->getOperand(0)->getType()), - std::min(storeAlign, loadAlign), C); + bool changed = performCallSlotOptzn( + LI, SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + DL.getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -606,15 +604,13 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!srcAlloca) return false; - // Check that all of src is copied to dest. 
-  if (!DL) return false;
-
   ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
   if (!srcArraySize)
     return false;
 
-  uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) *
-    srcArraySize->getZExtValue();
+  const DataLayout &DL = cpy->getModule()->getDataLayout();
+  uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+                     srcArraySize->getZExtValue();
 
   if (cpyLen < srcSize)
     return false;
@@ -628,8 +624,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
     if (!destArraySize)
       return false;
 
-    uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) *
-      destArraySize->getZExtValue();
+    uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
+                        destArraySize->getZExtValue();
 
     if (destSize < srcSize)
       return false;
@@ -648,7 +644,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
         return false;
     }
 
-    uint64_t destSize = DL->getTypeAllocSize(StructTy);
+    uint64_t destSize = DL.getTypeAllocSize(StructTy);
 
     if (destSize < srcSize)
       return false;
@@ -659,7 +655,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
   // Check that dest points to memory that is at least as aligned as src.
   unsigned srcAlign = srcAlloca->getAlignment();
   if (!srcAlign)
-    srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType());
+    srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
   bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
   // If dest is not aligned enough and we can't increase its alignment then
   // bail out.
@@ -769,10 +765,9 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
 
 /// processMemCpyMemCpyDependence - We've found that the (upward scanning)
 /// memory dependence of memcpy 'M' is the memcpy 'MDep'.  Try to simplify M to
-/// copy from MDep's input if we can.  MSize is the size of M's copy.
+/// copy from MDep's input if we can.
 ///
-bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
-                                              uint64_t MSize) {
+bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
   // We can only transform memcpys where the dest of one is the source of the
   // other.
   if (M->getSource() != MDep->getDest() || MDep->isVolatile())
     return false;
@@ -844,6 +839,103 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
   return true;
 }
 
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+///   memset(dst, c, dst_size);
+///   memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+///   memcpy(dst, src, src_size);
+///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+                                              MemSetInst *MemSet) {
+  // We can only transform memset/memcpy with the same destination.
+  if (MemSet->getDest() != MemCpy->getDest())
+    return false;
+
+  // Check that there are no other dependencies on the memset destination.
+  MemDepResult DstDepInfo =
+      MD->getPointerDependencyFrom(AliasAnalysis::getLocationForDest(MemSet),
+                                   false, MemCpy, MemCpy->getParent());
+  if (DstDepInfo.getInst() != MemSet)
+    return false;
+
+  // Use the same i8* dest as the memcpy, killing the memset dest if different.
+  Value *Dest = MemCpy->getRawDest();
+  Value *DestSize = MemSet->getLength();
+  Value *SrcSize = MemCpy->getLength();
+
+  // By default, create an unaligned memset.
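+  // For example, with DestAlign == 8 and a constant SrcSize == 20, the tail
+  // at dst+20 is only guaranteed 4-byte alignment: MinAlign(20, 8) == 4.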
+ unsigned Align = 1; + // If Dest is aligned, and SrcSize is constant, use the minimum alignment + // of the sum. + const unsigned DestAlign = + std::max(MemSet->getAlignment(), MemCpy->getAlignment()); + if (DestAlign > 1) + if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize)) + Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign); + + IRBuilder<> Builder(MemCpy); + + // If the sizes have different types, zext the smaller one. + if (DestSize->getType() != SrcSize->getType()) { + if (DestSize->getType()->getIntegerBitWidth() > + SrcSize->getType()->getIntegerBitWidth()) + SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType()); + else + DestSize = Builder.CreateZExt(DestSize, SrcSize->getType()); + } + + Value *MemsetLen = + Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize), + ConstantInt::getNullValue(DestSize->getType()), + Builder.CreateSub(DestSize, SrcSize)); + Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1), + MemsetLen, Align); + + MD->removeInstruction(MemSet); + MemSet->eraseFromParent(); + return true; +} + +/// Transform memcpy to memset when its source was just memset. +/// In other words, turn: +/// \code +/// memset(dst1, c, dst1_size); +/// memcpy(dst2, dst1, dst2_size); +/// \endcode +/// into: +/// \code +/// memset(dst1, c, dst1_size); +/// memset(dst2, c, dst2_size); +/// \endcode +/// When dst2_size <= dst1_size. +/// +/// The \p MemCpy must have a Constant length. +bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, + MemSetInst *MemSet) { + // This only makes sense on memcpy(..., memset(...), ...). + if (MemSet->getRawDest() != MemCpy->getRawSource()) + return false; + + ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength()); + ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength()); + // Make sure the memcpy doesn't read any more than what the memset wrote. + // Don't worry about sizes larger than i64. + if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue()) + return false; + + IRBuilder<> Builder(MemCpy); + Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), + CopySize, MemCpy->getAlignment()); + return true; +} /// processMemCpy - perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite @@ -874,17 +966,26 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return true; } + MemDepResult DepInfo = MD->getDependency(M); + + // Try to turn a partially redundant memset + memcpy into + // memcpy + smaller memset. We don't need the memcpy size for this. + if (DepInfo.isClobber()) + if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst())) + if (processMemSetMemCpyDependence(M, MDep)) + return true; + // The optimizations after this point require the memcpy size. ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); if (!CopySize) return false; - // The are three possible optimizations we can do for memcpy: + // There are four possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. // c) memcpy from freshly alloca'd space or space that has just started its // lifetime copies undefined data, and we can therefore eliminate the // memcpy in favor of the data that was already at the destination. - MemDepResult DepInfo = MD->getDependency(M); + // d) memcpy from a just-memset'd source can be turned into memset. 
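Optimization (d) above, written out as compilable C with concrete (hypothetical) sizes; note the precondition that the copy must not read past what the memset wrote:

    #include <string.h>

    void before(char *out) {
      char tmp[16];
      memset(tmp, 0xAB, 16);
      memcpy(out, tmp, 8);    /* 8 <= 16: reads only memset'd bytes */
    }

    void after(char *out) {
      char tmp[16];
      memset(tmp, 0xAB, 16);  /* kept; DSE may later remove it if dead */
      memset(out, 0xAB, 8);   /* the memcpy has become a second memset */
    }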
if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), @@ -900,9 +1001,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, M, M->getParent()); + if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) - return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + return processMemCpyMemCpyDependence(M, MDep); } else if (SrcDepInfo.isDef()) { Instruction *I = SrcDepInfo.getInst(); bool hasUndefContents = false; @@ -924,6 +1026,15 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { } } + if (SrcDepInfo.isClobber()) + if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst())) + if (performMemCpyToMemSetOptzn(M, MDep)) { + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumCpyToSet; + return true; + } + return false; } @@ -959,12 +1070,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { - if (!DL) return false; - + const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); - uint64_t ByValSize = DL->getTypeAllocSize(ByValTy); + uint64_t ByValSize = DL.getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), true, CS.getInstruction(), @@ -997,8 +1107,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { *CS->getParent()->getParent()); DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (MDep->getAlignment() < ByValAlign && - getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC, - CS.getInstruction(), &DT) < ByValAlign) + getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, + CS.getInstruction(), &AC, &DT) < ByValAlign) return false; // Verify that the copied-from memory doesn't change in between the memcpy and @@ -1051,7 +1161,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { RepeatInstruction = processMemCpy(M); else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) RepeatInstruction = processMemMove(M); - else if (CallSite CS = (Value*)I) { + else if (auto CS = CallSite(I)) { for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) if (CS.isByValArgument(i)) MadeChange |= processByValArgument(CS, i); @@ -1077,9 +1187,7 @@ bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TLI = &getAnalysis<TargetLibraryInfo>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); // If we don't have at least memset and memcpy, there is little point of doing // anything here. 
  // These are required by a freestanding implementation, so if
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 1f73cbc..611a941 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -81,12 +81,13 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <vector>
@@ -115,9 +116,9 @@ public:
 private:
   // This transformation requires dominator and postdominator info
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfo>();
-    AU.addRequired<MemoryDependenceAnalysis>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<AliasAnalysis>();
+    AU.addPreserved<MemoryDependenceAnalysis>();
     AU.addPreserved<AliasAnalysis>();
   }
 
@@ -168,7 +169,7 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() {
 INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",
                       "MergedLoadStoreMotion", false, false)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
                     "MergedLoadStoreMotion", false, false)
@@ -579,7 +580,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
 /// \brief Run the transformation for each function
 ///
 bool MergedLoadStoreMotion::runOnFunction(Function &F) {
-  MD = &getAnalysis<MemoryDependenceAnalysis>();
+  MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>();
   AA = &getAnalysis<AliasAnalysis>();
 
   bool Changed = false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
new file mode 100644
index 0000000..5b370e0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -0,0 +1,481 @@
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+//   void foo(int a, int b) {
+//     bar(a + b);
+//     bar((a + 2) + b);
+//   }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+//   int t = a + b;
+//   bar(t);
+//   bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects
+// that (a + b) was computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of
+// (a + b) + c, it checks whether a + c or b + c has already been computed by a
+// dominating instruction. If so, it then reassociates (a + b) + c into
+// (a + c) + b or (b + c) + a and removes the redundancy accordingly. To
+// efficiently look up whether an expression has been computed before, we store
+// each instruction seen and its SCEV into an SCEV-to-instruction map.
+//
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the
+// function in depth-first order. For example, given
+//
+//   (a + c) + d
+//   ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to be run for multiple
+// iterations before emitting optimal code. One source of this need is that we
+// only split an operand when it is used only once. The above algorithm can
+// eliminate an instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+//   ac = a + c
+//   ab = a + b
+//   abc = ab + c
+//   ab2 = ab + b
+//   ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As
+// a result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only consider n-ary adds for now. This should be extended and
+// generalized.
+//
+// 2) Besides arithmetic operations, similar reassociation can be applied to
+// GEPs. For example, if
+//   X = &arr[a]
+// dominates
+//   Y = &arr[a + b]
+// we may rewrite Y into X + b.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+class NaryReassociate : public FunctionPass {
+public:
+  static char ID;
+
+  NaryReassociate(): FunctionPass(ID) {
+    initializeNaryReassociatePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool doInitialization(Module &M) override {
+    DL = &M.getDataLayout();
+    return false;
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<ScalarEvolution>();
+    AU.addPreserved<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  // Runs only one iteration of the dominator-based algorithm. See the header
+  // comments for why we need multiple iterations.
+ bool doOneIteration(Function &F); + + // Reassociates I for better CSE. + Instruction *tryReassociate(Instruction *I); + + // Reassociate GEP for better CSE. + Instruction *tryReassociateGEP(GetElementPtrInst *GEP); + // Try splitting GEP at the I-th index and see whether either part can be + // CSE'ed. This is a helper function for tryReassociateGEP. + // + // \p IndexedType The element type indexed by GEP's I-th index. This is + // equivalent to + // GEP->getIndexedType(GEP->getPointerOperand(), 0-th index, + // ..., i-th index). + GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP, + unsigned I, Type *IndexedType); + // Given GEP's I-th index = LHS + RHS, see whether &Base[..][LHS][..] or + // &Base[..][RHS][..] can be CSE'ed and rewrite GEP accordingly. + GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP, + unsigned I, Value *LHS, + Value *RHS, Type *IndexedType); + + // Reassociate Add for better CSE. + Instruction *tryReassociateAdd(BinaryOperator *I); + // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. + Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); + // Rewrites I to LHS + RHS if LHS is computed already. + Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); + + // Returns the closest dominator of \c Dominatee that computes + // \c CandidateExpr. Returns null if not found. + Instruction *findClosestMatchingDominator(const SCEV *CandidateExpr, + Instruction *Dominatee); + // GetElementPtrInst implicitly sign-extends an index if the index is shorter + // than the pointer size. This function returns whether Index is shorter than + // GEP's pointer size, i.e., whether Index needs to be sign-extended in order + // to be an index of GEP. + bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); + + DominatorTree *DT; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + TargetTransformInfo *TTI; + const DataLayout *DL; + // A lookup table quickly telling which instructions compute the given SCEV. + // Note that there can be multiple instructions at different locations + // computing to the same SCEV, so we map a SCEV to an instruction list. For + // example, + // + // if (p1) + // foo(a + b); + // if (p2) + // bar(a + b); + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> SeenExprs; +}; +} // anonymous namespace + +char NaryReassociate::ID = 0; +INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", + false, false) + +FunctionPass *llvm::createNaryReassociatePass() { + return new NaryReassociate(); +} + +bool NaryReassociate::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = &getAnalysis<ScalarEvolution>(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool Changed = false, ChangedInThisIteration; + do { + ChangedInThisIteration = doOneIteration(F); + Changed |= ChangedInThisIteration; + } while (ChangedInThisIteration); + return Changed; +} + +// Whitelist the instruction types NaryReassociate handles for now. 
+static bool isPotentiallyNaryReassociable(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::GetElementPtr: + return true; + default: + return false; + } +} + +bool NaryReassociate::doOneIteration(Function &F) { + bool Changed = false; + SeenExprs.clear(); + // Process the basic blocks in pre-order of the dominator tree. This order + // ensures that all bases of a candidate are in Candidates when we process it. + for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT); + Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { + BasicBlock *BB = Node->getBlock(); + for (auto I = BB->begin(); I != BB->end(); ++I) { + if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { + if (Instruction *NewI = tryReassociate(I)) { + Changed = true; + SE->forgetValue(I); + I->replaceAllUsesWith(NewI); + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + I = NewI; + } + // Add the rewritten instruction to SeenExprs; the original instruction + // is deleted. + SeenExprs[SE->getSCEV(I)].push_back(I); + } + } + } + return Changed; +} + +Instruction *NaryReassociate::tryReassociate(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Add: + return tryReassociateAdd(cast<BinaryOperator>(I)); + case Instruction::GetElementPtr: + return tryReassociateGEP(cast<GetElementPtrInst>(I)); + default: + llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable"); + } +} + +// FIXME: extract this method into TTI->getGEPCost. +static bool isGEPFoldable(GetElementPtrInst *GEP, + const TargetTransformInfo *TTI, + const DataLayout *DL) { + GlobalVariable *BaseGV = nullptr; + int64_t BaseOffset = 0; + bool HasBaseReg = false; + int64_t Scale = 0; + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) + BaseGV = GV; + else + HasBaseReg = true; + + gep_type_iterator GTI = gep_type_begin(GEP); + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) { + BaseOffset += ConstIdx->getSExtValue() * ElementSize; + } else { + // Needs scale register. + if (Scale != 0) { + // No addressing mode takes two scale registers. + return false; + } + Scale = ElementSize; + } + } else { + StructType *STy = cast<StructType>(*GTI); + uint64_t Field = cast<ConstantInt>(*I)->getZExtValue(); + BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field); + } + } + return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV, + BaseOffset, HasBaseReg, Scale); +} + +Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) { + // Not worth reassociating GEP if it is foldable. 
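+  // For example (illustrative, not from this patch), on x86 a GEP such as
+  //   %p = getelementptr i32, i32* %base, i64 %i
+  // typically folds into the "[%base + %i*4]" addressing mode of the load or
+  // store that uses it, so rewriting it would not save an instruction.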
+  if (isGEPFoldable(GEP, TTI, DL))
+    return nullptr;
+
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
+    if (isa<SequentialType>(*GTI++)) {
+      if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1, *GTI)) {
+        return NewGEP;
+      }
+    }
+  }
+  return nullptr;
+}
+
+bool NaryReassociate::requiresSignExtension(Value *Index,
+                                            GetElementPtrInst *GEP) {
+  unsigned PointerSizeInBits =
+      DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
+  return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
+}
+
+GetElementPtrInst *
+NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,
+                                          Type *IndexedType) {
+  Value *IndexToSplit = GEP->getOperand(I + 1);
+  if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit))
+    IndexToSplit = SExt->getOperand(0);
+
+  if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
+    // If the I-th index needs sext and the underlying add is not equipped with
+    // nsw, we cannot split the add because
+    // sext(LHS + RHS) != sext(LHS) + sext(RHS).
+    if (requiresSignExtension(IndexToSplit, GEP) && !AO->hasNoSignedWrap())
+      return nullptr;
+    Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
+    // IndexToSplit = LHS + RHS.
+    if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+      return NewGEP;
+    // Symmetrically, try IndexToSplit = RHS + LHS.
+    if (LHS != RHS) {
+      if (auto *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+        return NewGEP;
+    }
+  }
+  return nullptr;
+}
+
+GetElementPtrInst *
+NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,
+                                          Value *LHS, Value *RHS,
+                                          Type *IndexedType) {
+  // Look for GEP's closest dominator that has the same SCEV as GEP except that
+  // the I-th index is replaced with LHS.
+  SmallVector<const SCEV *, 4> IndexExprs;
+  for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+    IndexExprs.push_back(SE->getSCEV(*Index));
+  // Replace the I-th index with LHS.
+  IndexExprs[I] = SE->getSCEV(LHS);
+  const SCEV *CandidateExpr = SE->getGEPExpr(
+      GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()),
+      IndexExprs, GEP->isInBounds());
+
+  auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+  if (Candidate == nullptr)
+    return nullptr;
+
+  PointerType *TypeOfCandidate = dyn_cast<PointerType>(Candidate->getType());
+  // Pretty rare but theoretically possible: a non-pointer value may happen to
+  // compute the same SCEV as CandidateExpr.
+  if (TypeOfCandidate == nullptr)
+    return nullptr;
+
+  // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+  uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+  Type *ElementType = TypeOfCandidate->getElementType();
+  uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+  // Another, less rare, case: because I is not necessarily the last index of
+  // the GEP, the size of the type at the I-th index (IndexedSize) is not
+  // necessarily divisible by ElementSize. For example,
+  //
+  //   #pragma pack(1)
+  //   struct S {
+  //     int a[3];
+  //     int64 b[8];
+  //   };
+  //   #pragma pack()
+  //
+  //   sizeof(S) = 76 is indivisible by sizeof(int64) = 8.
+  //
+  // For now we bail out on this case; emitting an uglygep instead is a TODO.
+  if (IndexedSize % ElementSize != 0)
+    return nullptr;
+
+  // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))]
+  IRBuilder<> Builder(GEP);
+  Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate);
+  if (RHS->getType() != IntPtrTy)
+    RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+  if (IndexedSize != ElementSize) {
+    RHS = Builder.CreateMul(
+        RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+  }
+  GetElementPtrInst *NewGEP =
+      cast<GetElementPtrInst>(Builder.CreateGEP(Candidate, RHS));
+  NewGEP->setIsInBounds(GEP->isInBounds());
+  NewGEP->takeName(GEP);
+  return NewGEP;
+}
+
+Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) {
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  if (auto *NewI = tryReassociateAdd(LHS, RHS, I))
+    return NewI;
+  if (auto *NewI = tryReassociateAdd(RHS, LHS, I))
+    return NewI;
+  return nullptr;
+}
+
+Instruction *NaryReassociate::tryReassociateAdd(Value *LHS, Value *RHS,
+                                                Instruction *I) {
+  Value *A = nullptr, *B = nullptr;
+  // To be conservative, we reassociate I only when it is the only user of A+B.
+  if (LHS->hasOneUse() && match(LHS, m_Add(m_Value(A), m_Value(B)))) {
+    // I = (A + B) + RHS
+    //   = (A + RHS) + B or (B + RHS) + A
+    const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
+    const SCEV *RHSExpr = SE->getSCEV(RHS);
+    if (BExpr != RHSExpr) {
+      if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(AExpr, RHSExpr), B, I))
+        return NewI;
+    }
+    if (AExpr != RHSExpr) {
+      if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(BExpr, RHSExpr), A, I))
+        return NewI;
+    }
+  }
+  return nullptr;
+}
+
+Instruction *NaryReassociate::tryReassociatedAdd(const SCEV *LHSExpr,
+                                                 Value *RHS, Instruction *I) {
+  auto Pos = SeenExprs.find(LHSExpr);
+  // Bail out if LHSExpr has not been seen before.
+  if (Pos == SeenExprs.end())
+    return nullptr;
+
+  // Look for the closest dominator LHS of I that computes LHSExpr, and replace
+  // I with LHS + RHS.
+  auto *LHS = findClosestMatchingDominator(LHSExpr, I);
+  if (LHS == nullptr)
+    return nullptr;
+
+  Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
+  NewI->takeName(I);
+  return NewI;
+}
+
+Instruction *
+NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr,
+                                              Instruction *Dominatee) {
+  auto Pos = SeenExprs.find(CandidateExpr);
+  if (Pos == SeenExprs.end())
+    return nullptr;
+
+  auto &Candidates = Pos->second;
+  // Because we process the basic blocks in pre-order of the dominator tree, a
+  // candidate that doesn't dominate the current instruction won't dominate any
+  // future instruction either. Therefore, we pop it off the stack. This
+  // optimization makes the algorithm O(n).
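+  // E.g., if candidates A and B both compute CandidateExpr and A dominates B,
+  // A was pushed before B; once B fails to dominate Dominatee, it cannot
+  // dominate anything processed later either, so discarding it is safe.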
+  while (!Candidates.empty()) {
+    Instruction *Candidate = Candidates.back();
+    if (DT->dominates(Candidate, Dominatee))
+      return Candidate;
+    Candidates.pop_back();
+  }
+  return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 5c8bed5..31d7df3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -18,7 +18,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -52,16 +52,18 @@ INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
                 "Partially inline calls to library functions", false, false)
 void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetLibraryInfo>();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   FunctionPass::getAnalysisUsage(AU);
 }
 bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
   bool Changed = false;
   Function::iterator CurrBB;
-  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
-  const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
+  TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  const TargetTransformInfo *TTI =
+      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
     CurrBB = BB++;
@@ -126,7 +128,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
   // Move all instructions following Call to newly created block JoinBB.
   // Create phi and replace all uses.
-  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
   IRBuilder<> Builder(JoinBB, JoinBB->begin());
   PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
   Call->replaceAllUsesWith(Phi);
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 0000000..3e7deeb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,993 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in a gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumCallSafepoints, "Number of call safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop");
+STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+                                  cl::init(false));
+
+/// If true, do not place backedge safepoints in counted loops.
+static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are worth supporting for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+                                   cl::init(false));
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false));
+
+namespace {
+
+/// An analysis pass whose purpose is to identify each of the backedges in
+/// the function which require a safepoint poll to be inserted.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+  static char ID;
+
+  /// The output of the pass - gives a list of each backedge (described by
+  /// pointing at the branch) which needs a poll inserted.
+  std::vector<TerminatorInst *> PollLocations;
+
+  /// True unless we're running spp-no-call, in which case we need to disable
+  /// the call-dependent placement opts.
+  bool CallSafepointsEnabled;
+
+  ScalarEvolution *SE = nullptr;
+  DominatorTree *DT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+      : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+    initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *);
+  void runOnLoopAndSubLoops(Loop *L) {
+    // Visit all the subloops
+    for (auto I = L->begin(), E = L->end(); I != E; I++)
+      runOnLoopAndSubLoops(*I);
+    runOnLoop(L);
+  }
+
+  bool runOnFunction(Function &F) override {
+    SE = &getAnalysis<ScalarEvolution>();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    for (auto I = LI->begin(), E = LI->end(); I != E; I++) {
+      runOnLoopAndSubLoops(*I);
+    }
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    // We no longer modify the IR at all in this pass. Thus all
+    // analyses are preserved.
+    AU.setPreservesAll();
+  }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  PlaceSafepoints() : FunctionPass(ID) {
+    initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We modify the graph wholesale (inlining, block insertion, etc). We
+    // preserve nothing at the moment. We could potentially preserve dom tree
+    // if that were worth doing.
+  }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call; that's the
+// caller's job.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+
+static bool isGCLeafFunction(const CallSite &CS);
+
+static bool needsStatepoint(const CallSite &CS) {
+  if (isGCLeafFunction(CS))
+    return false;
+  if (CS.isCall()) {
+    CallInst *call = cast<CallInst>(CS.getInstruction());
+    if (call->isInlineAsm())
+      return false;
+  }
+  if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) {
+    return false;
+  }
+  return true;
+}
+
+static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P);
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred. Returns a conservative correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+                                               BasicBlock *Pred,
+                                               DominatorTree &DT) {
+  // In general, we're looking for any cut of the graph which ensures
+  // there's a call safepoint along every edge between Header and Pred.
+  // For the moment, we look only for the 'cuts' that consist of a single call
+  // instruction in a block which is dominated by the Header and dominates the
+  // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
+  // of such dominating blocks gets substantially more occurrences than just
+  // checking the Pred and Header blocks themselves. This may be due to the
+  // density of loop exit conditions caused by range and null checks.
+  // TODO: structure this as an analysis pass, cache the result for subloops,
+  // avoid dom tree recalculations
+  assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+  BasicBlock *Current = Pred;
+  while (true) {
+    for (Instruction &I : *Current) {
+      if (auto CS = CallSite(&I))
+        // Note: Technically, needing a safepoint isn't quite the right
+        // condition here. We should instead be checking if the target method
+        // has an unconditional poll. In practice, this is only a theoretical
+        // concern since we don't have any methods with conditional-only
+        // safepoint polls.
+        if (needsStatepoint(CS))
+          return true;
+    }
+
+    if (Current == Header)
+      break;
+    Current = DT.getNode(Current)->getIDom()->getBlock();
+  }
+
+  return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations. Note that this function may return false for a loop which
+/// does actually terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+                                    BasicBlock *Pred) {
+  // Only used when SkipCounted is off
+  const unsigned upperTripBound = 8192;
+
+  // A conservative bound on the loop as a whole.
+  const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+  if (MaxTrips != SE->getCouldNotCompute()) {
+    if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound))
+      return true;
+    if (SkipCounted &&
+        SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32))
+      return true;
+  }
+
+  // If this is a conditional branch to the header with the alternate path
+  // being outside the loop, we can ask questions about the execution frequency
+  // of the exit block.
+  if (L->isLoopExiting(Pred)) {
+    // This returns an exact expression only. TODO: We really only need an
+    // upper bound here, but SE doesn't expose that.
+    const SCEV *MaxExec = SE->getExitCount(L, Pred);
+    if (MaxExec != SE->getCouldNotCompute()) {
+      if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound))
+        return true;
+      if (SkipCounted &&
+          SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32))
+        return true;
+    }
+  }
+
+  return /* not finite */ false;
+}
+
+static void scanOneBB(Instruction *start, Instruction *end,
+                      std::vector<CallInst *> &calls,
+                      std::set<BasicBlock *> &seen,
+                      std::vector<BasicBlock *> &worklist) {
+  for (BasicBlock::iterator itr(start);
+       itr != start->getParent()->end() && itr != BasicBlock::iterator(end);
+       itr++) {
+    if (CallInst *CI = dyn_cast<CallInst>(&*itr)) {
+      calls.push_back(CI);
+    }
+    // FIXME: This code does not handle invokes
+    assert(!dyn_cast<InvokeInst>(&*itr) &&
+           "support for invokes in poll code needed");
+    // Only add the successor blocks if we reach the terminator instruction
+    // without encountering end first
+    if (itr->isTerminator()) {
+      BasicBlock *BB = itr->getParent();
+      for (BasicBlock *Succ : successors(BB)) {
+        if (seen.count(Succ) == 0) {
+          worklist.push_back(Succ);
+          seen.insert(Succ);
+        }
+      }
+    }
+  }
+}
+
+static void scanInlinedCode(Instruction *start, Instruction *end,
+                            std::vector<CallInst *> &calls,
+                            std::set<BasicBlock *> &seen) {
+  calls.clear();
+  std::vector<BasicBlock *> worklist;
+  seen.insert(start->getParent());
+  scanOneBB(start, end, calls, seen, worklist);
+  while (!worklist.empty()) {
+    BasicBlock *BB = worklist.back();
+    worklist.pop_back();
+    scanOneBB(&*BB->begin(), end, calls, seen, worklist);
+  }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+  // Loop through all loop latches (branches controlling backedges). We need
+  // to place a safepoint on every backedge (potentially).
+  // Note: In common usage, there will be only one edge due to LoopSimplify
+  // having run sometime earlier in the pipeline, but this code must be correct
+  // w.r.t. loops with multiple backedges.
+  BasicBlock *header = L->getHeader();
+  SmallVector<BasicBlock *, 16> LoopLatches;
+  L->getLoopLatches(LoopLatches);
+  for (BasicBlock *pred : LoopLatches) {
+    assert(L->contains(pred));
+
+    // Make a policy decision about whether this loop needs a safepoint or
+    // not. Note that this is about unburdening the optimizer in loops, not
+    // avoiding the runtime cost of the actual safepoint.
+    if (!AllBackedges) {
+      if (mustBeFiniteCountedLoop(L, SE, pred)) {
+        if (TraceLSP)
+          errs() << "skipping safepoint placement in finite loop\n";
+        FiniteExecution++;
+        continue;
+      }
+      if (CallSafepointsEnabled &&
+          containsUnconditionalCallSafepoint(L, header, pred, *DT)) {
+        // Note: This is only semantically legal since we won't do any further
+        // IPO or inlining before the actual call insertion. If that were not
+        // the case, we might later lose this call safepoint.
+        if (TraceLSP)
+          errs() << "skipping safepoint placement due to unconditional call\n";
+        CallInLoop++;
+        continue;
+      }
+    }
+
+    // TODO: We can create an inner loop which runs a finite number of
+    // iterations with an outer loop which contains a safepoint. This would
+    // not help runtime performance that much, but it might help our ability to
+    // optimize the inner loop.
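+    // A hypothetical sketch of that TODO (not implemented in this patch):
+    //   for (i = 0; i < n; i += CHUNK)            // outer loop keeps the poll
+    //     for (j = i; j < min(i + CHUNK, n); j++) // inner loop stays poll-free
+    //       body(j);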
+
+    // Safepoint insertion would involve creating a new basic block (as the
+    // target of the current backedge) which does the safepoint (of all live
+    // variables) and branches to the true header
+    TerminatorInst *term = pred->getTerminator();
+
+    if (TraceLSP) {
+      errs() << "[LSP] terminator instruction: ";
+      term->dump();
+    }
+
+    PollLocations.push_back(term);
+  }
+
+  return false;
+}
+
+/// Returns true if an entry safepoint is not required before this callsite in
+/// the caller function.
+static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
+  Instruction *Inst = CS.getInstruction();
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::experimental_gc_statepoint:
+    case Intrinsic::experimental_patchpoint_void:
+    case Intrinsic::experimental_patchpoint_i64:
+      // These can wrap an actual call which may grow the stack by an unbounded
+      // amount or run forever.
+      return false;
+    default:
+      // Most LLVM intrinsics are things which do not expand to actual calls, or
+      // at least if they do, are leaf functions that cause only finite stack
+      // growth. In particular, the optimizer likes to form things like memsets
+      // out of stores in the original IR. Another important example is
+      // llvm.frameescape which must occur in the entry block. Inserting a
+      // safepoint before it is not legal since it could push the frameescape
+      // out of the entry block.
+      return true;
+    }
+  }
+  return false;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+                                                  DominatorTree &DT) {
+
+  // Conceptually, this poll needs to be on method entry, but in
+  // practice, we place it as late in the entry block as possible. We
+  // can place it as late as we want as long as it dominates all calls
+  // that can grow the stack. This, combined with backedge polls,
+  // gives us all the progress guarantees we need.
+
+  // hasNextInstruction and nextInstruction are used to iterate
+  // through a "straight line" execution sequence.
+
+  auto hasNextInstruction = [](Instruction *I) {
+    if (!I->isTerminator()) {
+      return true;
+    }
+    BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+    return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+  };
+
+  auto nextInstruction = [&hasNextInstruction](Instruction *I) {
+    assert(hasNextInstruction(I) &&
+           "first check if there is a next instruction!");
+    if (I->isTerminator()) {
+      return I->getParent()->getUniqueSuccessor()->begin();
+    } else {
+      return std::next(BasicBlock::iterator(I));
+    }
+  };
+
+  Instruction *cursor = nullptr;
+  for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor);
+       cursor = nextInstruction(cursor)) {
+
+    // We need to ensure a safepoint poll occurs before any 'real' call. The
+    // easiest way to ensure finite execution between safepoints in the face of
+    // recursive and mutually recursive functions is to enforce that each take
+    // a safepoint. Additionally, we need to ensure a poll before any call
+    // which can grow the stack by an unbounded amount. This isn't required
+    // for GC semantics per se, but is a common requirement for languages
+    // which detect stack overflow via guard pages and then throw exceptions.
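+    // For illustration (hypothetical IR, not from this patch): in an entry
+    // block "%x = add i32 %a, 1" followed by "call void @f()", the cursor
+    // walks past the add and stops at the call, so the poll lands just
+    // before "call @f".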
+    if (auto CS = CallSite(cursor)) {
+      if (doesNotRequireEntrySafepointBefore(CS))
+        continue;
+      break;
+    }
+  }
+
+  assert((hasNextInstruction(cursor) || cursor->isTerminator()) &&
+         "either we stopped because of a call, or because of terminator");
+
+  return cursor;
+}
+
+/// Identify the list of call sites which need to have parseable state
+static void findCallSafepoints(Function &F,
+                               std::vector<CallSite> &Found /*rval*/) {
+  assert(Found.empty() && "must be empty!");
+  for (Instruction &I : inst_range(F)) {
+    Instruction *inst = &I;
+    if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) {
+      CallSite CS(inst);
+
+      // No safepoint needed or wanted
+      if (!needsStatepoint(CS)) {
+        continue;
+      }
+
+      Found.push_back(CS);
+    }
+  }
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector. Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(std::vector<T> &vec) {
+  std::set<T> seen;
+  std::vector<T> tmp;
+  vec.reserve(vec.size());
+  std::swap(tmp, vec);
+  for (auto V : tmp) {
+    if (seen.insert(V).second) {
+      vec.push_back(V);
+    }
+  }
+}
+
+static std::string GCSafepointPollName("gc.safepoint_poll");
+
+static bool isGCSafepointPoll(Function &F) {
+  return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (F.hasGC()) {
+    const char *FunctionGCName = F.getGC();
+    const StringRef StatepointExampleName("statepoint-example");
+    const StringRef CoreCLRName("coreclr");
+    return (StatepointExampleName == FunctionGCName) ||
+           (CoreCLRName == FunctionGCName);
+  } else
+    return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+// Normalize basic block to make it ready to be target of invoke statepoint.
+// Ensure that 'BB' does not have phi nodes. It may require splitting it.
+static BasicBlock *normalizeForInvokeSafepoint(BasicBlock *BB,
+                                               BasicBlock *InvokeParent) {
+  BasicBlock *ret = BB;
+
+  if (!BB->getUniquePredecessor()) {
+    ret = SplitBlockPredecessors(BB, InvokeParent, "");
+  }
+
+  // Now that 'ret' has unique predecessor we can safely remove all phi nodes
+  // from it
+  FoldSingleEntryPHINodes(ret);
+  assert(!isa<PHINode>(ret->begin()));
+
+  return ret;
+}
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+  if (F.isDeclaration() || F.empty()) {
+    // This is a declaration, nothing to do. Must exit early to avoid crash in
+    // dom tree calculation
+    return false;
+  }
+
+  if (isGCSafepointPoll(F)) {
+    // Given we're inlining this inside of safepoint poll insertion, this
+    // doesn't make any sense. Note that we do make any contained calls
+    // parseable after we inline a poll.
+    return false;
+  }
+
+  if (!shouldRewriteFunction(F))
+    return false;
+
+  bool modified = false;
+
+  // In various bits below, we rely on the fact that uses are reachable from
+  // defs. When there are basic blocks unreachable from the entry, dominance
+  // and reachability queries return nonsensical results. Thus, we preprocess
+  // the function to ensure these properties hold.
+  modified |= removeUnreachableBlocks(F);
+
+  // STEP 1 - Insert the safepoint polling locations. We do not need to
+  // actually insert parse points yet. That will be done for all polls and
+  // calls in a single pass.
+
+  DominatorTree DT;
+  DT.recalculate(F);
+
+  SmallVector<Instruction *, 16> PollsNeeded;
+  std::vector<CallSite> ParsePointNeeded;
+
+  if (enableBackedgeSafepoints(F)) {
+    // Construct a pass manager to run the LoopPass backedge logic. We
+    // need the pass manager to handle scheduling all the loop passes
+    // appropriately. Doing this by hand is painful and just not worth messing
+    // with for the moment.
+    legacy::FunctionPassManager FPM(F.getParent());
+    bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+    PlaceBackedgeSafepointsImpl *PBS =
+        new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+    FPM.add(PBS);
+    FPM.run(F);
+
+    // We preserve dominance information when inserting the poll, otherwise
+    // we'd have to recalculate this on every insert
+    DT.recalculate(F);
+
+    auto &PollLocations = PBS->PollLocations;
+
+    auto OrderByBBName = [](Instruction *a, Instruction *b) {
+      return a->getParent()->getName() < b->getParent()->getName();
+    };
+    // We need the order of list to be stable so that naming ends up stable
+    // when we split edges. This makes test cases much easier to write.
+    std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+
+    // We can sometimes end up with duplicate poll locations. This happens if
+    // a single loop is visited more than once. The fact this happens seems
+    // wrong, but it does happen for the split-backedge.ll test case.
+    PollLocations.erase(std::unique(PollLocations.begin(),
+                                    PollLocations.end()),
+                        PollLocations.end());
+
+    // Insert a poll at each point the analysis pass identified
+    // The poll location must be the terminator of a loop latch block.
+    for (TerminatorInst *Term : PollLocations) {
+      // We are inserting a poll, the function is modified
+      modified = true;
+
+      if (SplitBackedge) {
+        // Split the backedge of the loop and insert the poll within that new
+        // basic block. This creates a loop with two latches per original
+        // latch (which is non-ideal), but this appears to be easier to
+        // optimize in practice than inserting the poll immediately before the
+        // latch test.
+
+        // Since this is a latch, at least one of the successors must dominate
+        // it. It's possible that we have a) duplicate edges to the same header
+        // and b) edges to distinct loop headers. We need to insert polls on
+        // each.
+        SetVector<BasicBlock *> Headers;
+        for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+          BasicBlock *Succ = Term->getSuccessor(i);
+          if (DT.dominates(Succ, Term->getParent())) {
+            Headers.insert(Succ);
+          }
+        }
+        assert(!Headers.empty() && "poll location is not a loop latch?");
+
+        // The split loop structure here is so that we only need to recalculate
+        // the dominator tree once. Alternatively, we could just keep it up to
+        // date and use a more natural merged loop.
+        SetVector<BasicBlock *> SplitBackedges;
+        for (BasicBlock *Header : Headers) {
+          BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+          PollsNeeded.push_back(NewBB->getTerminator());
+          NumBackedgeSafepoints++;
+        }
+      } else {
+        // Split the latch block itself, right before the terminator.
+        PollsNeeded.push_back(Term);
+        NumBackedgeSafepoints++;
+      }
+    }
+  }
+
+  if (enableEntrySafepoints(F)) {
+    Instruction *Location = findLocationForEntrySafepoint(F, DT);
+    if (!Location) {
+      // policy choice not to insert?
+    } else {
+      PollsNeeded.push_back(Location);
+      modified = true;
+      NumEntrySafepoints++;
+    }
+  }
+
+  // Now that we've identified all the needed safepoint poll locations, insert
+  // the safepoint polls themselves.
+  for (Instruction *PollLocation : PollsNeeded) {
+    std::vector<CallSite> RuntimeCalls;
+    InsertSafepointPoll(PollLocation, RuntimeCalls);
+    ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+                            RuntimeCalls.end());
+  }
+  PollsNeeded.clear(); // make sure we don't accidentally use the stale list
+
+  // The dominator tree has been invalidated by the inlining performed in the
+  // above loop. TODO: Teach the inliner how to update the dom tree?
+  DT.recalculate(F);
+
+  if (enableCallSafepoints(F)) {
+    std::vector<CallSite> Calls;
+    findCallSafepoints(F, Calls);
+    NumCallSafepoints += Calls.size();
+    ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end());
+  }
+
+  // Unique the vectors since we can end up with duplicates if we scan the call
+  // site for call safepoints after we add it for entry or backedge. The
+  // only reason we need tracking at all is that some functions might have
+  // polls but not call safepoints and thus we might miss marking the runtime
+  // calls for the polls. (This is useful in test cases!)
+  unique_unsorted(ParsePointNeeded);
+
+  // Any parse point (no matter what source) will be handled here
+
+  // We're about to start modifying the function
+  if (!ParsePointNeeded.empty())
+    modified = true;
+
+  // Now run through and insert the safepoints, but do _NOT_ update or remove
+  // any existing uses. We have references to live variables that need to
+  // survive to the last iteration of this loop.
+  std::vector<Value *> Results;
+  Results.reserve(ParsePointNeeded.size());
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+
+    // For invoke statepoints we need to remove all phi nodes at the normal
+    // destination block.
+    // The reason is that we can place the gc_result only after the last phi
+    // node of the basic block, and we would get malformed code after RAUW of
+    // the gc_result if one of these phi nodes used the result from the invoke.
+    if (InvokeInst *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      normalizeForInvokeSafepoint(Invoke->getNormalDest(),
+                                  Invoke->getParent());
+    }
+
+    Value *GCResult = ReplaceWithStatepoint(CS, nullptr);
+    Results.push_back(GCResult);
+  }
+  assert(Results.size() == ParsePointNeeded.size());
+
+  // Adjust all users of the old call sites to use the new ones instead
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+    Value *GCResult = Results[i];
+    if (GCResult) {
+      // Cannot RAUW for the invoke gc result if phi nodes are present.
+      assert(CS.isCall() ||
+             !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin()));
+
+      // Replace all uses with the new call
+      CS.getInstruction()->replaceAllUsesWith(GCResult);
+    }
+
+    // Now that we've handled all uses, remove the original call itself
+    // Note: The insert point can't be the deleted instruction!
+    CS.getInstruction()->eraseFromParent();
+  }
+  return modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+FunctionPass *llvm::createPlaceSafepointsPass() {
+  return new PlaceSafepoints();
+}
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+                      "place-backedge-safepoints-impl",
+                      "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+                    "place-backedge-safepoints-impl",
+                    "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+                      false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+                    false, false)
+
+static bool isGCLeafFunction(const CallSite &CS) {
+  Instruction *inst = CS.getInstruction();
+  if (isa<IntrinsicInst>(inst)) {
+    // Most LLVM intrinsics are things which can never take a safepoint.
+    // As a result, we don't need to have the stack parsable at the
+    // callsite. This is a highly useful optimization since intrinsic
+    // calls are fairly prevalent, particularly in debug builds.
+    return true;
+  }
+
+  // If this function is marked explicitly as a leaf call, we don't need to
+  // place a safepoint for it. In fact, for correctness we *can't* in many
+  // cases. Note: Indirect calls return null for the called function; these
+  // obviously aren't runtime functions with attributes.
+  // TODO: Support attributes on the call site as well.
+  const Function *F = CS.getCalledFunction();
+  bool isLeaf =
+      F &&
+      F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true");
+  if (isLeaf) {
+    return true;
+  }
+  return false;
+}
+
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Module *M = InsertBefore->getModule();
+  assert(M && "must be part of a module");
+
+  // Inline the safepoint poll implementation - this will get all the branch,
+  // control flow, etc.. Most importantly, it will introduce the actual slow
+  // path call - where we need to insert a safepoint (parsepoint).
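+  // For reference, a minimal gc.safepoint_poll body might look like the
+  // following (an illustrative sketch only; the real body is supplied by the
+  // GC/runtime, and @poll_flag and @do_safepoint are hypothetical names):
+  //
+  //   define void @gc.safepoint_poll() {
+  //   entry:
+  //     %flag = load atomic i8, i8* @poll_flag acquire, align 1
+  //     %need = icmp ne i8 %flag, 0
+  //     br i1 %need, label %slow, label %done
+  //   slow:
+  //     call void @do_safepoint()   ; this call becomes the parse point
+  //     br label %done
+  //   done:
+  //     ret void
+  //   }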
+
+  auto *F = M->getFunction(GCSafepointPollName);
+  assert(F->getType()->getElementType() ==
+             FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+         "gc.safepoint_poll declared with wrong type");
+  assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+  CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+  // Record some information about the call site we're replacing
+  BasicBlock::iterator before(PollCall), after(PollCall);
+  bool isBegin(false);
+  if (before == OrigBB->begin()) {
+    isBegin = true;
+  } else {
+    before--;
+  }
+  after++;
+  assert(after != OrigBB->end() && "must have successor");
+
+  // do the actual inlining
+  InlineFunctionInfo IFI;
+  bool InlineStatus = InlineFunction(PollCall, IFI);
+  assert(InlineStatus && "inline must succeed");
+  (void)InlineStatus; // suppress warning in release-asserts
+
+  // Check post conditions
+  assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+  std::vector<CallInst *> calls; // new calls
+  std::set<BasicBlock *> BBs;    // new BBs + insertee
+  // Include only the newly inserted instructions. Note: begin may not be valid
+  // if we inserted to the beginning of the basic block
+  BasicBlock::iterator start;
+  if (isBegin) {
+    start = OrigBB->begin();
+  } else {
+    start = before;
+    start++;
+  }
+
+  // If your poll function includes an unreachable at the end, that's not
+  // valid. Bugpoint likes to create this, so check for it.
+  assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) &&
+         "malformed poll function");
+
+  scanInlinedCode(&*(start), &*(after), calls, BBs);
+  assert(!calls.empty() && "slow path not found for safepoint poll");
+
+  // Record the fact we need a parsable state at the runtime call contained in
+  // the poll function. This is required so that the runtime knows how to
+  // parse the last frame when we actually take the safepoint (i.e. execute
+  // the slow path)
+  assert(ParsePointsNeeded.empty());
+  for (size_t i = 0; i < calls.size(); i++) {
+
+    // No safepoint needed or wanted
+    if (!needsStatepoint(calls[i])) {
+      continue;
+    }
+
+    // These are likely runtime calls. Should we assert that via calling
+    // convention or something?
+    ParsePointsNeeded.push_back(CallSite(calls[i]));
+  }
+  assert(ParsePointsNeeded.size() <= calls.size());
+}
+
+/// Replaces the given call site (Call or Invoke) with a gc.statepoint
+/// intrinsic with an empty deoptimization arguments list. This does
+/// NOT do explicit relocation for GC support.
+static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */
+                                    Pass *P) {
+  assert(CS.getInstruction()->getParent()->getParent()->getParent() &&
+         "must be set");
+
+  // TODO: technically, a pass is not allowed to get functions from within a
+  // function pass since it might trigger a new function addition. Refactor
+  // this logic out to the initialization of the pass. Doesn't appear to
+  // matter in practice.
+
+  // Then go ahead and use the builder to actually do the inserts. We insert
+  // immediately before the previous instruction under the assumption that all
+  // arguments will be available here. We can't insert afterwards since we may
+  // be replacing a terminator.
+  IRBuilder<> Builder(CS.getInstruction());
+
+  // Note: The gc args are not filled in at this time, that's handled by
+  // RewriteStatepointsForGC (which is currently under review).
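+  // For example (hypothetical attributes, not added by this patch), a call
+  // such as
+  //   call void @foo() "statepoint-id"="42" "statepoint-num-patch-bytes"="8"
+  // yields ID = 42 and NumPatchBytes = 8 below; both string attributes are
+  // then stripped from the resulting statepoint.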
+
+  // Create the statepoint given all the arguments
+  Instruction *Token = nullptr;
+
+  uint64_t ID;
+  uint32_t NumPatchBytes;
+
+  AttributeSet OriginalAttrs = CS.getAttributes();
+  Attribute AttrID =
+      OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+  Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
+      AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
+
+  AttrBuilder AttrsToRemove;
+  bool HasID = AttrID.isStringAttribute() &&
+               !AttrID.getValueAsString().getAsInteger(10, ID);
+
+  if (HasID)
+    AttrsToRemove.addAttribute("statepoint-id");
+  else
+    ID = 0xABCDEF00;
+
+  bool HasNumPatchBytes =
+      AttrNumPatchBytes.isStringAttribute() &&
+      !AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
+
+  if (HasNumPatchBytes)
+    AttrsToRemove.addAttribute("statepoint-num-patch-bytes");
+  else
+    NumPatchBytes = 0;
+
+  OriginalAttrs = OriginalAttrs.removeAttributes(
+      CS.getInstruction()->getContext(), AttributeSet::FunctionIndex,
+      AttrsToRemove);
+
+  Value *StatepointTarget = NumPatchBytes == 0
+                                ? CS.getCalledValue()
+                                : ConstantPointerNull::get(cast<PointerType>(
+                                      CS.getCalledValue()->getType()));
+
+  if (CS.isCall()) {
+    CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *Call = Builder.CreateGCStatepointCall(
+        ID, NumPatchBytes, StatepointTarget,
+        makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None,
+        "safepoint_token");
+    Call->setTailCall(ToReplace->isTailCall());
+    Call->setCallingConv(ToReplace->getCallingConv());
+
+    // In case we can handle this set of attributes - set up function
+    // attributes directly on the statepoint and return attributes later for
+    // the gc_result intrinsic.
+    Call->setAttributes(OriginalAttrs.getFnAttributes());
+
+    Token = Call;
+
+    // Put the following gc_result and gc_relocate calls immediately after
+    // the old call (which we're about to delete).
+    assert(ToReplace->getNextNode() && "not a terminator, must have next");
+    Builder.SetInsertPoint(ToReplace->getNextNode());
+    Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+  } else if (CS.isInvoke()) {
+    InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+    // Insert the new invoke into the old block. We'll remove the old one in a
+    // moment at which point this will become the new terminator for the
+    // original block.
+    Builder.SetInsertPoint(ToReplace->getParent());
+    InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+        ID, NumPatchBytes, StatepointTarget, ToReplace->getNormalDest(),
+        ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()),
+        None, None, "safepoint_token");
+
+    Invoke->setCallingConv(ToReplace->getCallingConv());
+
+    // In case we can handle this set of attributes - set up function
+    // attributes directly on the statepoint and return attributes later for
+    // the gc_result intrinsic.
+    Invoke->setAttributes(OriginalAttrs.getFnAttributes());
+
+    Token = Invoke;
+
+    // We'll insert the gc.result into the normal block
+    BasicBlock *NormalDest = ToReplace->getNormalDest();
+    // Cannot insert gc.result if phi nodes are present.
+    // We should have removed these cases prior to running this function.
+    assert(!isa<PHINode>(NormalDest->begin()));
+    Instruction *IP = &*(NormalDest->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+  } else {
+    llvm_unreachable("unexpected type of CallSite");
+  }
+  assert(Token);
+
+  // Handle the return value of the original call - update all uses to use a
+  // gc_result hanging off the statepoint node we just inserted
+
+  // Only add the gc_result iff there is actually a used result
+  if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+    std::string TakenName =
+        CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+    CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), TakenName);
+    GCResult->setAttributes(OriginalAttrs.getRetAttributes());
+    return GCResult;
+  } else {
+    // No return value for the call.
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 4e02255..b677523 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -59,7 +59,7 @@ namespace {
 }
 #ifndef NDEBUG
-/// PrintOps - Print out the expression identified in the Ops list.
+/// Print out the expression identified in the Ops list.
 ///
 static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
   Module *M = I->getParent()->getParent()->getParent();
@@ -233,8 +233,8 @@ INITIALIZE_PASS(Reassociate, "reassociate",
 // Public interface to the Reassociate pass
 FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
-/// isReassociableOp - Return true if V is an instruction of the specified
-/// opcode and if it only has one use.
+/// Return true if V is an instruction of the specified opcode and if it
+/// only has one use.
 static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
   if (V->hasOneUse() && isa<Instruction>(V) &&
       cast<Instruction>(V)->getOpcode() == Opcode &&
@@ -321,10 +321,8 @@ unsigned Reassociate::getRank(Value *V) {
   // If this is a not or neg instruction, do not count it for rank. This
   // assures us that X and ~X will have the same rank.
- Type *Ty = V->getType(); - if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || - (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && - !BinaryOperator::isFNeg(I))) + if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) && + !BinaryOperator::isFNeg(I)) ++Rank; DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n"); @@ -351,7 +349,7 @@ void Reassociate::canonicalizeOperands(Instruction *I) { static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { - if (S1->getType()->isIntegerTy()) + if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore); else { BinaryOperator *Res = @@ -363,7 +361,7 @@ static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name, static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { - if (S1->getType()->isIntegerTy()) + if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore); else { BinaryOperator *Res = @@ -375,7 +373,7 @@ static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name, static BinaryOperator *CreateNeg(Value *S1, const Twine &Name, Instruction *InsertBefore, Value *FlagsOp) { - if (S1->getType()->isIntegerTy()) + if (S1->getType()->isIntOrIntVectorTy()) return BinaryOperator::CreateNeg(S1, Name, InsertBefore); else { BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore); @@ -384,12 +382,11 @@ static BinaryOperator *CreateNeg(Value *S1, const Twine &Name, } } -/// LowerNegateToMultiply - Replace 0-X with X*-1. -/// +/// Replace 0-X with X*-1. static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Type *Ty = Neg->getType(); - Constant *NegOne = Ty->isIntegerTy() ? ConstantInt::getAllOnesValue(Ty) - : ConstantFP::get(Ty, -1.0); + Constant *NegOne = Ty->isIntOrIntVectorTy() ? + ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0); BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg); Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op. @@ -399,8 +396,8 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { return Res; } -/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda -/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for +/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael +/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for /// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. /// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every /// even x in Bitwidth-bit arithmetic. @@ -410,7 +407,7 @@ static unsigned CarmichaelShift(unsigned Bitwidth) { return Bitwidth - 2; } -/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS', +/// Add the extra weight 'RHS' to the existing weight 'LHS', /// reducing the combined weight using any special properties of the operation. /// The existing weight LHS represents the computation X op X op ... op X where /// X occurs LHS times. The combined weight represents X op X op ... 
op X with @@ -492,7 +489,7 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { typedef std::pair<Value*, APInt> RepeatedValue; -/// LinearizeExprTree - Given an associative binary expression, return the leaf +/// Given an associative binary expression, return the leaf /// nodes in Ops along with their weights (how many times the leaf occurs). The /// original expression is the same as /// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times @@ -742,8 +739,8 @@ static bool LinearizeExprTree(BinaryOperator *I, return Changed; } -// RewriteExprTree - Now that the operands for this expression tree are -// linearized and optimized, emit them in-order. +/// Now that the operands for this expression tree are +/// linearized and optimized, emit them in-order. void Reassociate::RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { assert(Ops.size() > 1 && "Single values should be used directly!"); @@ -872,7 +869,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, Constant *Undef = UndefValue::get(I->getType()); NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), Undef, Undef, "", I); - if (NewOp->getType()->isFloatingPointTy()) + if (NewOp->getType()->isFPOrFPVectorTy()) NewOp->setFastMathFlags(I->getFastMathFlags()); } else { NewOp = NodesToRewrite.pop_back_val(); @@ -912,15 +909,18 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, RedoInsts.insert(NodesToRewrite[i]); } -/// NegateValue - Insert instructions before the instruction pointed to by BI, +/// Insert instructions before the instruction pointed to by BI, /// that computes the negative version of the value specified. The negative /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { - if (ConstantFP *C = dyn_cast<ConstantFP>(V)) - return ConstantExpr::getFNeg(C); - if (Constant *C = dyn_cast<Constant>(V)) + if (Constant *C = dyn_cast<Constant>(V)) { + if (C->getType()->isFPOrFPVectorTy()) { + return ConstantExpr::getFNeg(C); + } return ConstantExpr::getNeg(C); + } + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an @@ -984,8 +984,7 @@ static Value *NegateValue(Value *V, Instruction *BI) { return CreateNeg(V, V->getName() + ".neg", BI, BI); } -/// ShouldBreakUpSubtract - Return true if we should break up this subtract of -/// X-Y into (X + -Y). +/// Return true if we should break up this subtract of X-Y into (X + -Y). static bool ShouldBreakUpSubtract(Instruction *Sub) { // If this is a negation, we can't split it up! if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub)) @@ -1014,9 +1013,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { return false; } -/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is -/// only used by an add, transform this into (X+(0-Y)) to promote better -/// reassociation. +/// If we have (X-Y), and if either X is an add, or if this is only used by an +/// add, transform this into (X+(0-Y)) to promote better reassociation. static BinaryOperator *BreakUpSubtract(Instruction *Sub) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. 
@@ -1038,9 +1036,8 @@ static BinaryOperator *BreakUpSubtract(Instruction *Sub) { return New; } -/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used -/// by one, change this into a multiply by a constant to assist with further -/// reassociation. +/// If this is a shift of a reassociable multiply or is used by one, change +/// this into a multiply by a constant to assist with further reassociation. static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { Constant *MulCst = ConstantInt::get(Shl->getType(), 1); MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); @@ -1065,10 +1062,9 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { return Mul; } -/// FindInOperandList - Scan backwards and forwards among values with the same -/// rank as element i to see if X exists. If X does not exist, return i. This -/// is useful when scanning for 'x' when we see '-x' because they both get the -/// same rank. +/// Scan backwards and forwards among values with the same rank as element i +/// to see if X exists. If X does not exist, return i. This is useful when +/// scanning for 'x' when we see '-x' because they both get the same rank. static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; @@ -1093,7 +1089,7 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, return i; } -/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together +/// Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl<WeakVH> &Ops){ @@ -1105,8 +1101,8 @@ static Value *EmitAddTreeOfValues(Instruction *I, return CreateAdd(V2, V1, "tmp", I, I); } -/// RemoveFactorFromExpression - If V is an expression tree that is a -/// multiplication sequence, and if this sequence contains a multiply by Factor, +/// If V is an expression tree that is a multiplication sequence, +/// and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); @@ -1178,8 +1174,8 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { return V; } -/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively -/// add its operands as factors, otherwise add V to the list of factors. +/// If V is a single-use multiply, recursively add its operands as factors, +/// otherwise add V to the list of factors. /// /// Ops is the top-level list of add operands we're trying to factor. static void FindSingleUseMultiplyFactors(Value *V, @@ -1196,10 +1192,9 @@ static void FindSingleUseMultiplyFactors(Value *V, FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); } -/// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor' -/// instruction. This optimizes based on identities. If it can be reduced to -/// a single Value, it is returned, otherwise the Ops list is mutated as -/// necessary. +/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction. +/// This optimizes based on identities. If it can be reduced to a single Value, +/// it is returned, otherwise the Ops list is mutated as necessary. 
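+/// For example (illustrative, not exhaustive): X & ~X reduces to 0,
+/// X | ~X to all-ones, X ^ X to 0, and duplicate operands of an 'and' or
+/// 'or' collapse to a single occurrence.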
static Value *OptimizeAndOrXor(unsigned Opcode, SmallVectorImpl<ValueEntry> &Ops) { // Scan the operand lists looking for X and ~X pairs, along with X,X pairs. @@ -1489,7 +1484,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, return nullptr; } -/// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This +/// Optimize a series of operands to an 'add' instruction. This /// optimizes based on identities. If it can be reduced to a single Value, it /// is returned, otherwise the Ops list is mutated as necessary. Value *Reassociate::OptimizeAdd(Instruction *I, @@ -1517,8 +1512,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // Insert a new multiply. Type *Ty = TheOp->getType(); - Constant *C = Ty->isIntegerTy() ? ConstantInt::get(Ty, NumFound) - : ConstantFP::get(Ty, NumFound); + Constant *C = Ty->isIntOrIntVectorTy() ? + ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound); Instruction *Mul = CreateMul(TheOp, C, "factor", I, I); // Now that we have inserted a multiply, optimize it. This allows us to @@ -1658,7 +1653,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = - I->getType()->isIntegerTy() + I->getType()->isIntOrIntVectorTy() ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal) : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal); @@ -1789,7 +1784,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder, Value *LHS = Ops.pop_back_val(); do { - if (LHS->getType()->isIntegerTy()) + if (LHS->getType()->isIntOrIntVectorTy()) LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); else LHS = Builder.CreateFMul(LHS, Ops.pop_back_val()); @@ -1942,8 +1937,7 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, return nullptr; } -/// EraseInst - Zap the given instruction, adding interesting operands to the -/// work list. +/// Zap the given instruction, adding interesting operands to the work list. void Reassociate::EraseInst(Instruction *I) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); @@ -1988,7 +1982,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { Constant *C = C0 ? C0 : C1; unsigned ConstIdx = C0 ? 0 : 1; if (auto *CI = dyn_cast<ConstantInt>(C)) { - if (!CI->isNegative()) + if (!CI->isNegative() || CI->isMinValue(true)) return nullptr; } else if (auto *CF = dyn_cast<ConstantFP>(C)) { if (!CF->isNegative()) @@ -2057,7 +2051,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { return NI; } -/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing +/// Inspect and optimize the given instruction. Note that erasing /// instructions is not allowed. void Reassociate::OptimizeInst(Instruction *I) { // Only consider operations that we understand. @@ -2087,8 +2081,9 @@ void Reassociate::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // Don't optimize vector instructions. - if (I->getType()->isVectorTy()) + // TODO: We should optimize vector Xor instructions, but they are + // currently unsupported. + if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) return; // Don't optimize floating point instructions that don't have unsafe algebra. 
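The isIntOrIntVectorTy changes above let the same factoring apply to vector types. The factoring itself, in which an operand occurring NumFound times in an add expression is replaced by a single multiply, can be modeled in isolation (a sketch with hypothetical names; not this pass's code):

    #include <cassert>
    #include <map>
    #include <vector>

    // Model of the OptimizeAdd rewrite: X + X + ... + X -> X * NumFound.
    static long sumFactored(const std::vector<long> &Ops) {
      std::map<long, long> NumFound;
      for (long Op : Ops)
        ++NumFound[Op];              // count occurrences of each operand
      long Sum = 0;
      for (const auto &KV : NumFound)
        Sum += KV.first * KV.second; // one multiply per distinct operand
      return Sum;
    }

    int main() {
      assert(sumFactored({7, 7, 7, 2}) == 23); // 7*3 + 2
      return 0;
    }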
@@ -2167,9 +2162,6 @@ void Reassociate::OptimizeInst(Instruction *I) { } void Reassociate::ReassociateExpression(BinaryOperator *I) { - assert(!I->getType()->isVectorTy() && - "Reassociation of vector instructions is not supported."); - // First, walk the expression tree, linearizing the tree, collecting the // operand information. SmallVector<RepeatedValue, 8> Tree; @@ -2192,7 +2184,7 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { // the vector. std::stable_sort(Ops.begin(), Ops.end()); - // OptimizeExpression - Now that we have the expression tree in a convenient + // Now that we have the expression tree in a convenient // sorted form, optimize it globally if possible. if (Value *V = OptimizeExpression(I, Ops)) { if (V == I) diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp new file mode 100644 index 0000000..6cf765a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -0,0 +1,2506 @@ +//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Rewrite an existing set of gc.statepoints such that they make potential +// relocations performed by the garbage collector explicit in the IR. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Statepoint.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" + +#define DEBUG_TYPE "rewrite-statepoints-for-gc" + +using namespace llvm; + +// Print tracing output +static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, + cl::init(false)); + +// Print the liveset found at the insert location +static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, + cl::init(false)); +static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden, + cl::init(false)); +// Print out the base pointers for debugging +static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden, + cl::init(false)); + +// Cost threshold measuring when it is profitable to rematerialize value instead +// of relocating it +static cl::opt<unsigned> +RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden, + cl::init(6)); + +#ifdef XDEBUG +static bool ClobberNonLive = true; +#else +static bool ClobberNonLive = false; +#endif +static cl::opt<bool, true> 
ClobberNonLiveOverride("rs4gc-clobber-non-live",
+ cl::location(ClobberNonLive),
+ cl::Hidden);
+
+namespace {
+struct RewriteStatepointsForGC : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteStatepointsForGC() : FunctionPass(ID) {
+ initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We add and rewrite a bunch of instructions, but don't really do much
+ // else. We could in theory preserve a lot more analyses here.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+FunctionPass *llvm::createRewriteStatepointsForGCPass() {
+ return new RewriteStatepointsForGC();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+
+namespace {
+struct GCPtrLivenessData {
+ /// Values defined in this block.
+ DenseMap<BasicBlock *, DenseSet<Value *>> KillSet;
+ /// Values used in this block (and thus live); does not include values
+ /// killed within this block.
+ DenseMap<BasicBlock *, DenseSet<Value *>> LiveSet;
+
+ /// Values live into this basic block (i.e. used by any
+ /// instruction in this basic block or ones reachable from here)
+ DenseMap<BasicBlock *, DenseSet<Value *>> LiveIn;
+
+ /// Values live out of this basic block (i.e. live into
+ /// any successor block)
+ DenseMap<BasicBlock *, DenseSet<Value *>> LiveOut;
+};
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all entries of the second type to the first type
+typedef DenseMap<Value *, Value *> DefiningValueMapTy;
+typedef DenseSet<llvm::Value *> StatepointLiveSetTy;
+typedef DenseMap<Instruction *, Value *> RematerializedValueMapTy;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live across this safepoint
+ StatepointLiveSetTy liveset;
+
+ /// Mapping from live pointers to a base-defining-value
+ DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ Instruction *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+
+ /// Record live values that are rematerialized instead of relocated.
+ /// They are not included in the 'liveset' field.
+ /// Maps each rematerialized copy to its original value.
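+ /// (Schematic example: if %derived = gep %base, 16 is live across a
+ /// statepoint, it can be cheaper to relocate only %base and re-issue the
+ /// gep after the statepoint than to relocate both pointers; see
+ /// RematerializationThreshold above.)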
+ RematerializedValueMapTy RematerializedValues; +}; +} + +/// Compute the live-in set for every basic block in the function +static void computeLiveInValues(DominatorTree &DT, Function &F, + GCPtrLivenessData &Data); + +/// Given results from the dataflow liveness computation, find the set of live +/// Values at a particular instruction. +static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data, + StatepointLiveSetTy &out); + +// TODO: Once we can get to the GCStrategy, this becomes +// Optional<bool> isGCManagedPointer(const Value *V) const override { + +static bool isGCPointerType(const Type *T) { + if (const PointerType *PT = dyn_cast<PointerType>(T)) + // For the sake of this example GC, we arbitrarily pick addrspace(1) as our + // GC managed heap. We know that a pointer into this heap needs to be + // updated and that no other pointer does. + return (1 == PT->getAddressSpace()); + return false; +} + +// Return true if this type is one which a) is a gc pointer or contains a GC +// pointer and b) is of a type this code expects to encounter as a live value. +// (The insertion code will assert that a type which matches (a) and not (b) +// is not encountered.) +static bool isHandledGCPointerType(Type *T) { + // We fully support gc pointers + if (isGCPointerType(T)) + return true; + // We partially support vectors of gc pointers. The code will assert if it + // can't handle something. + if (auto VT = dyn_cast<VectorType>(T)) + if (isGCPointerType(VT->getElementType())) + return true; + return false; +} + +#ifndef NDEBUG +/// Returns true if this type contains a gc pointer whether we know how to +/// handle that type or not. +static bool containsGCPtrType(Type *Ty) { + if (isGCPointerType(Ty)) + return true; + if (VectorType *VT = dyn_cast<VectorType>(Ty)) + return isGCPointerType(VT->getScalarType()); + if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) + return containsGCPtrType(AT->getElementType()); + if (StructType *ST = dyn_cast<StructType>(Ty)) + return std::any_of( + ST->subtypes().begin(), ST->subtypes().end(), + [](Type *SubType) { return containsGCPtrType(SubType); }); + return false; +} + +// Returns true if this is a type which a) is a gc pointer or contains a GC +// pointer and b) is of a type which the code doesn't expect (i.e. first class +// aggregates). Used to trip assertions. +static bool isUnhandledGCPointerType(Type *Ty) { + return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty); +} +#endif + +static bool order_by_name(llvm::Value *a, llvm::Value *b) { + if (a->hasName() && b->hasName()) { + return -1 == a->getName().compare(b->getName()); + } else if (a->hasName() && !b->hasName()) { + return true; + } else if (!a->hasName() && b->hasName()) { + return false; + } else { + // Better than nothing, but not stable + return a < b; + } +} + +// Conservatively identifies any definitions which might be live at the +// given instruction. The analysis is performed immediately before the +// given instruction. Values defined by that instruction are not considered +// live. Values used by that instruction are considered live. 
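+// (The computation behind this is the usual backward liveness dataflow; in
+// terms of the GCPtrLivenessData fields above, iterated to a fixed point:
+//   LiveOut(B) = union over all successors S of B of LiveIn(S)
+//   LiveIn(B)  = (LiveOut(B) - KillSet(B)) union LiveSet(B)
+// For example, a pointer defined in the entry block and used only in a
+// return block is live-out of every block on the paths between them.)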
+static void analyzeParsePointLiveness(
+ DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData,
+ const CallSite &CS, PartiallyConstructedSafepointRecord &result) {
+ Instruction *inst = CS.getInstruction();
+
+ StatepointLiveSetTy liveset;
+ findLiveSetAtInst(inst, OriginalLivenessData, liveset);
+
+ if (PrintLiveSet) {
+ // Note: This output is used by several of the test cases
+ // The order of elements in a set is not stable; put them in a vector and
+ // sort by name
+ SmallVector<Value *, 64> temp;
+ temp.insert(temp.end(), liveset.begin(), liveset.end());
+ std::sort(temp.begin(), temp.end(), order_by_name);
+ errs() << "Live Variables:\n";
+ for (Value *V : temp) {
+ errs() << " " << V->getName(); // no newline
+ V->dump();
+ }
+ }
+ if (PrintLiveSetSize) {
+ errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+ errs() << "Number live values: " << liveset.size() << "\n";
+ }
+ result.liveset = liveset;
+}
+
+static Value *findBaseDefiningValue(Value *I);
+
+/// If we can trivially determine that the index specified in the given vector
+/// is a base pointer, return it. In cases where the entire vector is known to
+/// consist of base pointers, the entire vector will be returned. This
+/// indicates that the relevant extractelement is a valid base pointer and
+/// should be used directly.
+static Value *findBaseOfVector(Value *I, Value *Index) {
+ assert(I->getType()->isVectorTy() &&
+ cast<VectorType>(I->getType())->getElementType()->isPointerTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ // Each case parallels findBaseDefiningValue below, see that code for
+ // detailed motivation.
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ return I;
+
+ // We shouldn't see the address of a global as a vector value.
+ assert(!isa<GlobalVariable>(I) &&
+ "unexpected global variable found in base of vector");
+
+ // inlining could possibly introduce a phi node that contains
+ // undef if the callee has multiple returns
+ if (isa<UndefValue>(I))
+ // utterly meaningless, but useful for dealing with partially optimized
+ // code.
+ return I;
+
+ // Due to inheritance, this must be _after_ the global variable and undef
+ // checks
+ if (Constant *Con = dyn_cast<Constant>(I)) {
+ assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
+ "order of checks wrong!");
+ assert(Con->isNullValue() && "null is the only case which makes sense");
+ return Con;
+ }
+
+ if (isa<LoadInst>(I))
+ return I;
+
+ // For an insert element, we might be able to look through it if we know
+ // something about the indexes, but if the indices are arbitrary values, we
+ // can't without much more extensive scalarization.
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(I)) {
+ Value *InsertIndex = IEI->getOperand(2);
+ // This index is inserting the value, look for its base
+ if (InsertIndex == Index)
+ return findBaseDefiningValue(IEI->getOperand(1));
+ // Both constant, and can't be equal per above. This insert is definitely
+ // not relevant, look back at the rest of the vector and keep trying.
+ if (isa<ConstantInt>(Index) && isa<ConstantInt>(InsertIndex))
+ return findBaseOfVector(IEI->getOperand(0), Index);
+ }
+
+ // Note: This code is currently rather incomplete. We are essentially only
+ // handling cases where the vector element is trivially a base pointer. We
+ // need to update the entire base pointer construction algorithm to know how
+ // to track vector elements and potentially scalarize, but the case which
+ // would motivate the work hasn't shown up in real workloads yet.
+ llvm_unreachable("no base found for vector element");
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input or b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers)
+static Value *findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPointerTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ // This case is a bit of a hack - it only handles extracts from vectors which
+ // trivially contain only base pointers or cases where we can directly match
+ // the index of the original extract element to an insertion into the vector.
+ // See note inside the function for how to improve this.
+ if (auto *EEI = dyn_cast<ExtractElementInst>(I)) {
+ Value *VectorOperand = EEI->getVectorOperand();
+ Value *Index = EEI->getIndexOperand();
+ Value *VectorBase = findBaseOfVector(VectorOperand, Index);
+ // If the result returned is a vector, we know the entire vector must
+ // contain base pointers. In that case, the extractelement is a valid base
+ // for this value.
+ if (VectorBase->getType()->isVectorTy())
+ return EEI;
+ // Otherwise, we needed to look through the vector to find the base for
+ // this particular element.
+ assert(VectorBase->getType()->isPointerTy());
+ return VectorBase;
+ }
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ // We should have never reached here if this argument isn't a gc value
+ return I;
+
+ if (isa<GlobalVariable>(I))
+ // base case
+ return I;
+
+ // inlining could possibly introduce a phi node that contains
+ // undef if the callee has multiple returns
+ if (isa<UndefValue>(I))
+ // utterly meaningless, but useful for dealing with
+ // partially optimized code.
+ return I;
+
+ // Due to inheritance, this must be _after_ the global variable and undef
+ // checks
+ if (Constant *Con = dyn_cast<Constant>(I)) {
+ assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
+ "order of checks wrong!");
+ // Note: Finding a constant base for something marked for relocation
+ // doesn't really make sense. The most likely case is either a) someone
+ // screwed up the address space usage or b) you're validating against
+ // compiled C++ code w/o the proper separation. The only real exception
+ // is a null pointer. You could have generic code written to index off
+ // of a potentially null value and have proven it null. We also use
+ // null pointers in dead paths of relocation phis (which we might later
+ // want to find a base pointer for).
+ assert(isa<ConstantPointerNull>(Con) &&
+ "null is the only case which makes sense");
+ return Con;
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Def = CI->stripPointerCasts();
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(Def);
+ }
+
+ if (isa<LoadInst>(I))
+ return I; // The value loaded is a gc base itself
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+ // The base of this GEP is the base
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_gc_result_ptr:
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ case Intrinsic::experimental_gc_result_float:
+ case Intrinsic::experimental_gc_result_int:
+ llvm_unreachable("these don't produce pointers");
+ case Intrinsic::experimental_gc_relocate: {
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ }
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return I;
+
+ // I have absolutely no idea how to implement this part yet. It's not
+ // necessarily hard; I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (isa<AtomicCmpXchgInst>(I))
+ // A CAS is effectively an atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load.
+ return I;
+
+ assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+ "binary ops which don't apply to pointers");
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this is a defining definition of the base just like a load is.
+ if (isa<ExtractValueInst>(I))
+ return I;
+
+ // We should never see an insert vector since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from among several
+ // base-derived pointers (each potentially with its own base). It's the job
+ // of the caller to resolve these.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "missing instruction case in findBaseDefiningValue");
+ return I;
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+ Value *&Cached = Cache[I];
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I);
+ }
+ assert(Cache[I] != nullptr);
+
+ if (TraceLSP) {
+ dbgs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName()
+ << "\n";
+ }
+ return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return its
+/// base defining value.
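+/// (Schematic walk: given
+///    %base = call i8 addrspace(1)* @alloc()
+///    %derived = getelementptr i8, i8 addrspace(1)* %base, i64 16
+/// findBaseDefiningValue(%derived) steps through the gep to %base; a call
+/// is assumed to return a base pointer, so %base is returned.)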
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { + Value *Def = findBaseDefiningValueCached(I, Cache); + auto Found = Cache.find(Def); + if (Found != Cache.end()) { + // Either a base-of relation, or a self reference. Caller must check. + return Found->second; + } + // Only a BDV available + return Def; +} + +/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, +/// is it known to be a base pointer? Or do we need to continue searching. +static bool isKnownBaseResult(Value *V) { + if (!isa<PHINode>(V) && !isa<SelectInst>(V)) { + // no recursion possible + return true; + } + if (isa<Instruction>(V) && + cast<Instruction>(V)->getMetadata("is_base_value")) { + // This is a previously inserted base phi or select. We know + // that this is a base value. + return true; + } + + // We need to keep searching + return false; +} + +// TODO: find a better name for this +namespace { +class PhiState { +public: + enum Status { Unknown, Base, Conflict }; + + PhiState(Status s, Value *b = nullptr) : status(s), base(b) { + assert(status != Base || b); + } + PhiState(Value *b) : status(Base), base(b) {} + PhiState() : status(Unknown), base(nullptr) {} + + Status getStatus() const { return status; } + Value *getBase() const { return base; } + + bool isBase() const { return getStatus() == Base; } + bool isUnknown() const { return getStatus() == Unknown; } + bool isConflict() const { return getStatus() == Conflict; } + + bool operator==(const PhiState &other) const { + return base == other.base && status == other.status; + } + + bool operator!=(const PhiState &other) const { return !(*this == other); } + + void dump() { + errs() << status << " (" << base << " - " + << (base ? base->getName() : "nullptr") << "): "; + } + +private: + Status status; + Value *base; // non null only if status == base +}; + +typedef DenseMap<Value *, PhiState> ConflictStateMapTy; +// Values of type PhiState form a lattice, and this is a helper +// class that implementes the meet operation. The meat of the meet +// operation is implemented in MeetPhiStates::pureMeet +class MeetPhiStates { +public: + // phiStates is a mapping from PHINodes and SelectInst's to PhiStates. + explicit MeetPhiStates(const ConflictStateMapTy &phiStates) + : phiStates(phiStates) {} + + // Destructively meet the current result with the base V. V can + // either be a merge instruction (SelectInst / PHINode), in which + // case its status is looked up in the phiStates map; or a regular + // SSA value, in which case it is assumed to be a base. + void meetWith(Value *V) { + PhiState otherState = getStateForBDV(V); + assert((MeetPhiStates::pureMeet(otherState, currentResult) == + MeetPhiStates::pureMeet(currentResult, otherState)) && + "math is wrong: meet does not commute!"); + currentResult = MeetPhiStates::pureMeet(otherState, currentResult); + } + + PhiState getResult() const { return currentResult; } + +private: + const ConflictStateMapTy &phiStates; + PhiState currentResult; + + /// Return a phi state for a base defining value. 
We'll generate a new + /// base state for known bases and expect to find a cached state otherwise + PhiState getStateForBDV(Value *baseValue) { + if (isKnownBaseResult(baseValue)) { + return PhiState(baseValue); + } else { + return lookupFromMap(baseValue); + } + } + + PhiState lookupFromMap(Value *V) { + auto I = phiStates.find(V); + assert(I != phiStates.end() && "lookup failed!"); + return I->second; + } + + static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { + switch (stateA.getStatus()) { + case PhiState::Unknown: + return stateB; + + case PhiState::Base: + assert(stateA.getBase() && "can't be null"); + if (stateB.isUnknown()) + return stateA; + + if (stateB.isBase()) { + if (stateA.getBase() == stateB.getBase()) { + assert(stateA == stateB && "equality broken!"); + return stateA; + } + return PhiState(PhiState::Conflict); + } + assert(stateB.isConflict() && "only three states!"); + return PhiState(PhiState::Conflict); + + case PhiState::Conflict: + return stateA; + } + llvm_unreachable("only three states!"); + } +}; +} +/// For a given value or instruction, figure out what base ptr it's derived +/// from. For gc objects, this is simply itself. On success, returns a value +/// which is the base pointer. (This is reliable and can be used for +/// relocation.) On failure, returns nullptr. +static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { + Value *def = findBaseOrBDV(I, cache); + + if (isKnownBaseResult(def)) { + return def; + } + + // Here's the rough algorithm: + // - For every SSA value, construct a mapping to either an actual base + // pointer or a PHI which obscures the base pointer. + // - Construct a mapping from PHI to unknown TOP state. Use an + // optimistic algorithm to propagate base pointer information. Lattice + // looks like: + // UNKNOWN + // b1 b2 b3 b4 + // CONFLICT + // When algorithm terminates, all PHIs will either have a single concrete + // base or be in a conflict state. + // - For every conflict, insert a dummy PHI node without arguments. Add + // these to the base[Instruction] = BasePtr mapping. For every + // non-conflict, add the actual base. + // - For every conflict, add arguments for the base[a] of each input + // arguments. + // + // Note: A simpler form of this would be to add the conflict form of all + // PHIs without running the optimistic algorithm. This would be + // analougous to pessimistic data flow and would likely lead to an + // overall worse solution. + + ConflictStateMapTy states; + states[def] = PhiState(); + // Recursively fill in all phis & selects reachable from the initial one + // for which we don't already know a definite base value for + // TODO: This should be rewritten with a worklist + bool done = false; + while (!done) { + done = true; + // Since we're adding elements to 'states' as we run, we can't keep + // iterators into the set. 
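+ // (Examples of pureMeet from above, schematically: meet(Unknown, Base b)
+ // = Base b; meet(Base b, Base b) = Base b; meet(Base b1, Base b2) =
+ // Conflict when b1 != b2; and Conflict absorbs everything else.)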
+ SmallVector<Value *, 16> Keys; + Keys.reserve(states.size()); + for (auto Pair : states) { + Value *V = Pair.first; + Keys.push_back(V); + } + for (Value *v : Keys) { + assert(!isKnownBaseResult(v) && "why did it get added?"); + if (PHINode *phi = dyn_cast<PHINode>(v)) { + assert(phi->getNumIncomingValues() > 0 && + "zero input phis are illegal"); + for (Value *InVal : phi->incoming_values()) { + Value *local = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(local) && states.find(local) == states.end()) { + states[local] = PhiState(); + done = false; + } + } + } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { + Value *local = findBaseOrBDV(sel->getTrueValue(), cache); + if (!isKnownBaseResult(local) && states.find(local) == states.end()) { + states[local] = PhiState(); + done = false; + } + local = findBaseOrBDV(sel->getFalseValue(), cache); + if (!isKnownBaseResult(local) && states.find(local) == states.end()) { + states[local] = PhiState(); + done = false; + } + } + } + } + + if (TraceLSP) { + errs() << "States after initialization:\n"; + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + state.dump(); + v->dump(); + } + } + + // TODO: come back and revisit the state transitions around inputs which + // have reached conflict state. The current version seems too conservative. + + bool progress = true; + while (progress) { +#ifndef NDEBUG + size_t oldSize = states.size(); +#endif + progress = false; + // We're only changing keys in this loop, thus safe to keep iterators + for (auto Pair : states) { + MeetPhiStates calculateMeet(states); + Value *v = Pair.first; + assert(!isKnownBaseResult(v) && "why did it get added?"); + if (SelectInst *select = dyn_cast<SelectInst>(v)) { + calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); + calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); + } else + for (Value *Val : cast<PHINode>(v)->incoming_values()) + calculateMeet.meetWith(findBaseOrBDV(Val, cache)); + + PhiState oldState = states[v]; + PhiState newState = calculateMeet.getResult(); + if (oldState != newState) { + progress = true; + states[v] = newState; + } + } + + assert(oldSize <= states.size()); + assert(oldSize == states.size() || progress); + } + + if (TraceLSP) { + errs() << "States after meet iteration:\n"; + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + state.dump(); + v->dump(); + } + } + + // Insert Phis for all conflicts + // We want to keep naming deterministic in the loop that follows, so + // sort the keys before iteration. This is useful in allowing us to + // write stable tests. Note that there is no invalidation issue here. 
+ SmallVector<Value *, 16> Keys; + Keys.reserve(states.size()); + for (auto Pair : states) { + Value *V = Pair.first; + Keys.push_back(V); + } + std::sort(Keys.begin(), Keys.end(), order_by_name); + // TODO: adjust naming patterns to avoid this order of iteration dependency + for (Value *V : Keys) { + Instruction *v = cast<Instruction>(V); + PhiState state = states[V]; + assert(!isKnownBaseResult(v) && "why did it get added?"); + assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); + if (!state.isConflict()) + continue; + + if (isa<PHINode>(v)) { + int num_preds = + std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); + assert(num_preds > 0 && "how did we reach here"); + PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + phi->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, phi); + } else { + SelectInst *sel = cast<SelectInst>(v); + // The undef will be replaced later + UndefValue *undef = UndefValue::get(sel->getType()); + SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, + undef, "base_select", sel); + // Add metadata marking this as a base value + auto *const_1 = ConstantInt::get( + Type::getInt32Ty( + v->getParent()->getParent()->getParent()->getContext()), + 1); + auto MDConst = ConstantAsMetadata::get(const_1); + MDNode *md = MDNode::get( + v->getParent()->getParent()->getParent()->getContext(), MDConst); + basesel->setMetadata("is_base_value", md); + states[v] = PhiState(PhiState::Conflict, basesel); + } + } + + // Fixup all the inputs of the new PHIs + for (auto Pair : states) { + Instruction *v = cast<Instruction>(Pair.first); + PhiState state = Pair.second; + + assert(!isKnownBaseResult(v) && "why did it get added?"); + assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); + if (!state.isConflict()) + continue; + + if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { + PHINode *phi = cast<PHINode>(v); + unsigned NumPHIValues = phi->getNumIncomingValues(); + for (unsigned i = 0; i < NumPHIValues; i++) { + Value *InVal = phi->getIncomingValue(i); + BasicBlock *InBB = phi->getIncomingBlock(i); + + // If we've already seen InBB, add the same incoming value + // we added for it earlier. The IR verifier requires phi + // nodes with multiple entries from the same basic block + // to have the same incoming value for each of those + // entries. If we don't do this check here and basephi + // has a different type than base, we'll end up adding two + // bitcasts (and hence two distinct values) as incoming + // values for the same basic block. + + int blockIndex = basephi->getBasicBlockIndex(InBB); + if (blockIndex != -1) { + Value *oldBase = basephi->getIncomingValue(blockIndex); + basephi->addIncoming(oldBase, InBB); +#ifndef NDEBUG + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + } + + // In essense this assert states: the only way two + // values incoming from the same basic block may be + // different is by being different bitcasts of the same + // value. 
A cleanup that remains TODO is changing + // findBaseOrBDV to return an llvm::Value of the correct + // type (and still remain pure). This will remove the + // need to add bitcasts. + assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && + "sanity -- findBaseOrBDV should be pure!"); +#endif + continue; + } + + // Find either the defining value for the PHI or the normal base for + // a non-phi node + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + } + assert(base && "can't be null"); + // Must use original input BB since base may not be Instruction + // The cast is needed since base traversal may strip away bitcasts + if (base->getType() != basephi->getType()) { + base = new BitCastInst(base, basephi->getType(), "cast", + InBB->getTerminator()); + } + basephi->addIncoming(base, InBB); + } + assert(basephi->getNumIncomingValues() == NumPHIValues); + } else { + SelectInst *basesel = cast<SelectInst>(state.getBase()); + SelectInst *sel = cast<SelectInst>(v); + // Operand 1 & 2 are true, false path respectively. TODO: refactor to + // something more safe and less hacky. + for (int i = 1; i <= 2; i++) { + Value *InVal = sel->getOperand(i); + // Find either the defining value for the PHI or the normal base for + // a non-phi node + Value *base = findBaseOrBDV(InVal, cache); + if (!isKnownBaseResult(base)) { + // Either conflict or base. + assert(states.count(base)); + base = states[base].getBase(); + assert(base != nullptr && "unknown PhiState!"); + } + assert(base && "can't be null"); + // Must use original input BB since base may not be Instruction + // The cast is needed since base traversal may strip away bitcasts + if (base->getType() != basesel->getType()) { + base = new BitCastInst(base, basesel->getType(), "cast", basesel); + } + basesel->setOperand(i, base); + } + } + } + + // Cache all of our results so we can cheaply reuse them + // NOTE: This is actually two caches: one of the base defining value + // relation and one of the base pointer relation! FIXME + for (auto item : states) { + Value *v = item.first; + Value *base = item.second.getBase(); + assert(v && base); + assert(!isKnownBaseResult(v) && "why did it get added?"); + + if (TraceLSP) { + std::string fromstr = + cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "") + : "none"; + errs() << "Updating base value cache" + << " for: " << (v->hasName() ? v->getName() : "") + << " from: " << fromstr + << " to: " << (base->hasName() ? base->getName() : "") << "\n"; + } + + assert(isKnownBaseResult(base) && + "must be something we 'know' is a base pointer"); + if (cache.count(v)) { + // Once we transition from the BDV relation being store in the cache to + // the base relation being stored, it must be stable + assert((!isKnownBaseResult(cache[v]) || cache[v] == base) && + "base relation should be stable"); + } + cache[v] = base; + } + assert(cache.find(def) != cache.end()); + return cache[def]; +} + +// For a set of live pointers (base and/or derived), identify the base +// pointer of the object which they are derived from. This routine will +// mutate the IR graph as needed to make the 'base' pointer live at the +// definition site of 'derived'. This ensures that any use of 'derived' can +// also use 'base'. This may involve the insertion of a number of +// additional PHI nodes. 
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+ DenseMap<llvm::Value *, llvm::Value *> &PointerToBase,
+ DominatorTree *DT, DefiningValueMapTy &DVCache) {
+ // For the naming of values inserted to be deterministic - which makes for
+ // much cleaner and more stable tests - we need to assign an order to the
+ // live values. DenseSets do not provide a deterministic order across runs.
+ SmallVector<Value *, 64> Temp;
+ Temp.insert(Temp.end(), live.begin(), live.end());
+ std::sort(Temp.begin(), Temp.end(), order_by_name);
+ for (Value *ptr : Temp) {
+ Value *base = findBasePointer(ptr, DVCache);
+ assert(base && "failed to find base pointer");
+ PointerToBase[ptr] = base;
+ assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+ DT->dominates(cast<Instruction>(base)->getParent(),
+ cast<Instruction>(ptr)->getParent())) &&
+ "The base we found better dominate the derived pointer");
+
+ // If you see this trip and like to live really dangerously, the code should
+ // be correct, just with idioms the verifier can't handle. You can try
+ // disabling the verifier at your own substantial risk.
+ assert(!isa<ConstantPointerNull>(base) &&
+ "the relocation code needs adjustment to handle the relocation of "
+ "a null pointer constant without causing false positives in the "
+ "safepoint ir verifier.");
+ }
+}
+
+/// Find the required base pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+ const CallSite &CS,
+ PartiallyConstructedSafepointRecord &result) {
+ DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+ findBasePointers(result.liveset, PointerToBase, &DT, DVCache);
+
+ if (PrintBasePointers) {
+ // Note: Need to print these in a stable order since this is checked in
+ // some tests.
+ errs() << "Base Pairs (w/o Relocation):\n";
+ SmallVector<Value *, 64> Temp;
+ Temp.reserve(PointerToBase.size());
+ for (auto Pair : PointerToBase) {
+ Temp.push_back(Pair.first);
+ }
+ std::sort(Temp.begin(), Temp.end(), order_by_name);
+ for (Value *Ptr : Temp) {
+ Value *Base = PointerToBase[Ptr];
+ errs() << " derived %" << Ptr->getName() << " base %" << Base->getName()
+ << "\n";
+ }
+ }
+
+ result.PointerToBase = PointerToBase;
+}
+
+/// Given an updated version of the dataflow liveness results, update the
+/// liveset and base pointer maps for the call site CS.
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ const CallSite &CS,
+ PartiallyConstructedSafepointRecord &result);
+
+static void recomputeLiveInValues(
+ Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ // TODO-PERF: reuse the original liveness, then simply run the dataflow
+ // again. The old values are still live and will help it stabilize quickly.
+ GCPtrLivenessData RevisedLivenessData;
+ computeLiveInValues(DT, F, RevisedLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ const CallSite &CS = toUpdate[i];
+ recomputeLiveInValues(RevisedLivenessData, CS, info);
+ }
+}
+
+// When inserting gc.relocate calls, we need to ensure there are no uses
+// of the original value between the gc.statepoint and the gc.relocate call.
+// One case which can arise is a phi node at the start of one of the successor
+// blocks. We also need to be able to insert the gc.relocates only on the path
+// which goes through the statepoint. We might need to split an edge to make
+// this possible.
+static BasicBlock *
+normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, Pass *P) {
+ DominatorTree *DT = nullptr;
+ if (auto *DTP = P->getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DT = &DTP->getDomTree();
+
+ BasicBlock *Ret = BB;
+ if (!BB->getUniquePredecessor()) {
+ Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, DT);
+ }
+
+ // Now that 'ret' has a unique predecessor, we can safely remove all phi
+ // nodes from it
+ FoldSingleEntryPHINodes(Ret);
+ assert(!isa<PHINode>(Ret->begin()));
+
+ // At this point, we can safely insert a gc.relocate as the first instruction
+ // in Ret if needed.
+ return Ret;
+}
+
+static int find_index(ArrayRef<Value *> livevec, Value *val) {
+ auto itr = std::find(livevec.begin(), livevec.end(), val);
+ assert(livevec.end() != itr);
+ size_t index = std::distance(livevec.begin(), itr);
+ assert(index < livevec.size());
+ return index;
+}
+
+// Create a new attribute set containing only attributes which can be
+// transferred from the original call to the safepoint.
+static AttributeSet legalizeCallAttributes(AttributeSet AS) {
+ AttributeSet ret;
+
+ for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
+ unsigned index = AS.getSlotIndex(Slot);
+
+ if (index == AttributeSet::ReturnIndex ||
+ index == AttributeSet::FunctionIndex) {
+
+ for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end;
+ ++it) {
+ Attribute attr = *it;
+
+ // Do not allow certain attributes - just skip them
+ // A safepoint cannot be read-only or read-none.
+ if (attr.hasAttribute(Attribute::ReadNone) ||
+ attr.hasAttribute(Attribute::ReadOnly))
+ continue;
+
+ ret = ret.addAttributes(
+ AS.getContext(), index,
+ AttributeSet::get(AS.getContext(), index, AttrBuilder(attr)));
+ }
+ }
+
+ // Just skip parameter attributes for now
+ }
+
+ return ret;
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+/// liveVariables - list of variables to be relocated.
+/// liveStart - index of the first live variable.
+/// basePtrs - base pointers.
+/// statepointToken - statepoint instruction to which relocates should be
+/// bound.
+/// Builder - LLVM IR builder to be used to construct new calls.
+static void CreateGCRelocates(ArrayRef<llvm::Value *> LiveVariables,
+ const int LiveStart,
+ ArrayRef<llvm::Value *> BasePtrs,
+ Instruction *StatepointToken,
+ IRBuilder<> Builder) {
+ SmallVector<Instruction *, 64> NewDefs;
+ NewDefs.reserve(LiveVariables.size());
+
+ Module *M = StatepointToken->getParent()->getParent()->getParent();
+
+ for (unsigned i = 0; i < LiveVariables.size(); i++) {
+ // We generate a (potentially) unique declaration for every pointer type
+ // combination. This results in some blow up in the function declarations in
+ // the IR, but removes the need for argument bitcasts, which shrinks the IR
+ // greatly and makes it much more readable.
+ SmallVector<Type *, 1> Types; // one per 'any' type
+ // All gc_relocate are set to i8 addrspace(1)* type. This could help avoid
+ // cases where the actual value's type mangling is not supported by llvm. A
+ // bitcast is added later to convert gc_relocate to the actual value's type.
+ Types.push_back(Type::getInt8PtrTy(M->getContext(), 1));
+ Value *GCRelocateDecl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_gc_relocate, Types);
+
+ // Generate the gc.relocate call and save the result
+ Value *BaseIdx =
+ ConstantInt::get(Type::getInt32Ty(M->getContext()),
+ LiveStart + find_index(LiveVariables, BasePtrs[i]));
+ Value *LiveIdx = ConstantInt::get(
+ Type::getInt32Ty(M->getContext()),
+ LiveStart + find_index(LiveVariables, LiveVariables[i]));
+
+ // only specify a debug name if we can give a useful one
+ Value *Reloc = Builder.CreateCall(
+ GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+ LiveVariables[i]->hasName() ? LiveVariables[i]->getName() + ".relocated"
+ : "");
+ // Trick CodeGen into thinking there are lots of free registers at this
+ // fake call.
+ cast<CallInst>(Reloc)->setCallingConv(CallingConv::Cold);
+
+ NewDefs.push_back(cast<Instruction>(Reloc));
+ }
+ assert(NewDefs.size() == LiveVariables.size() &&
+ "missing or extra redefinition at safepoint");
+}
+
+static void
+makeStatepointExplicitImpl(const CallSite &CS, /* to replace */
+ const SmallVectorImpl<llvm::Value *> &basePtrs,
+ const SmallVectorImpl<llvm::Value *> &liveVariables,
+ Pass *P,
+ PartiallyConstructedSafepointRecord &result) {
+ assert(basePtrs.size() == liveVariables.size());
+ assert(isStatepoint(CS) &&
+ "This method expects to be rewriting a statepoint");
+
+ BasicBlock *BB = CS.getInstruction()->getParent();
+ assert(BB);
+ Function *F = BB->getParent();
+ assert(F && "must be set");
+ Module *M = F->getParent();
+ (void)M;
+ assert(M && "must be set");
+
+ // We're not changing the function signature of the statepoint since the gc
+ // arguments go into the var args section.
+ Function *gc_statepoint_decl = CS.getCalledFunction();
+
+ // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
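+ // (Schematic shape of the rewritten call sequence; operand positions are
+ // illustrative only:
+ //   %tok = call @llvm.experimental.gc.statepoint(target, call args...,
+ //                                                gc args: %base, %derived)
+ //   %derived.relocated = call i8 addrspace(1)*
+ //       @llvm.experimental.gc.relocate(%tok, i32 BaseIdx, i32 DerivedIdx)
+ // followed by a bitcast of %derived.relocated back to %derived's type.)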
+ Instruction *insertBefore = CS.getInstruction();
+ IRBuilder<> Builder(insertBefore);
+ // Copy all of the arguments from the original statepoint - this includes the
+ // target, call args, and deopt args
+ SmallVector<llvm::Value *, 64> args;
+ args.insert(args.end(), CS.arg_begin(), CS.arg_end());
+ // TODO: Clear the 'needs rewrite' flag
+
+ // add all the pointers to be relocated (gc arguments)
+ // Capture the start of the live variable list for use in the gc_relocates
+ const int live_start = args.size();
+ args.insert(args.end(), liveVariables.begin(), liveVariables.end());
+
+ // Create the statepoint given all the arguments
+ Instruction *token = nullptr;
+ AttributeSet return_attributes;
+ if (CS.isCall()) {
+ CallInst *toReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *call =
+ Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token");
+ call->setTailCall(toReplace->isTailCall());
+ call->setCallingConv(toReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+ // In case we can handle this set of attributes, set up function attrs
+ // directly on the statepoint and return attrs later for the gc_result
+ // intrinsic.
+ call->setAttributes(new_attrs.getFnAttributes());
+ return_attributes = new_attrs.getRetAttributes();
+
+ token = call;
+
+ // Put the following gc_result and gc_relocate calls immediately after
+ // the old call (which we're about to delete)
+ BasicBlock::iterator next(toReplace);
+ assert(BB->end() != next && "not a terminator, must have next");
+ next++;
+ Instruction *IP = &*(next);
+ Builder.SetInsertPoint(IP);
+ Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+
+ } else {
+ InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment, at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *invoke = InvokeInst::Create(
+ gc_statepoint_decl, toReplace->getNormalDest(),
+ toReplace->getUnwindDest(), args, "", toReplace->getParent());
+ invoke->setCallingConv(toReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+ // In case we can handle this set of attributes, set up function attrs
+ // directly on the statepoint and return attrs later for the gc_result
+ // intrinsic.
+ invoke->setAttributes(new_attrs.getFnAttributes());
+ return_attributes = new_attrs.getRetAttributes();
+
+ token = invoke;
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *unwindBlock = toReplace->getUnwindDest();
+ assert(!isa<PHINode>(unwindBlock->begin()) &&
+ unwindBlock->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Instruction *IP = &*(unwindBlock->getFirstInsertionPt());
+ Builder.SetInsertPoint(IP);
+ Builder.SetCurrentDebugLocation(toReplace->getDebugLoc());
+
+ // Extract second element from landingpad return value. We will attach
+ // exceptional gc relocates to it.
+ const unsigned idx = 1;
+ Instruction *exceptional_token =
+ cast<Instruction>(Builder.CreateExtractValue(
+ unwindBlock->getLandingPadInst(), idx, "relocate_token"));
+ result.UnwindToken = exceptional_token;
+
+ // Just throw away the return value. We will use the one we got for the
+ // normal block.
+ (void)CreateGCRelocates(liveVariables, live_start, basePtrs, + exceptional_token, Builder); + + // Generate gc relocates and returns for normal block + BasicBlock *normalDest = toReplace->getNormalDest(); + assert(!isa<PHINode>(normalDest->begin()) && + normalDest->getUniquePredecessor() && + "can't safely insert in this block!"); + + IP = &*(normalDest->getFirstInsertionPt()); + Builder.SetInsertPoint(IP); + + // gc relocates will be generated later as if it were regular call + // statepoint + } + assert(token); + + // Take the name of the original value call if it had one. + token->takeName(CS.getInstruction()); + +// The GCResult is already inserted, we just need to find it +#ifndef NDEBUG + Instruction *toReplace = CS.getInstruction(); + assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && + "only valid use before rewrite is gc.result"); + assert(!toReplace->hasOneUse() || + isGCResult(cast<Instruction>(*toReplace->user_begin()))); +#endif + + // Update the gc.result of the original statepoint (if any) to use the newly + // inserted statepoint. This is safe to do here since the token can't be + // considered a live reference. + CS.getInstruction()->replaceAllUsesWith(token); + + result.StatepointToken = token; + + // Second, create a gc.relocate for every live variable + CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); +} + +namespace { +struct name_ordering { + Value *base; + Value *derived; + bool operator()(name_ordering const &a, name_ordering const &b) { + return -1 == a.derived->getName().compare(b.derived->getName()); + } +}; +} +static void stablize_order(SmallVectorImpl<Value *> &basevec, + SmallVectorImpl<Value *> &livevec) { + assert(basevec.size() == livevec.size()); + + SmallVector<name_ordering, 64> temp; + for (size_t i = 0; i < basevec.size(); i++) { + name_ordering v; + v.base = basevec[i]; + v.derived = livevec[i]; + temp.push_back(v); + } + std::sort(temp.begin(), temp.end(), name_ordering()); + for (size_t i = 0; i < basevec.size(); i++) { + basevec[i] = temp[i].base; + livevec[i] = temp[i].derived; + } +} + +// Replace an existing gc.statepoint with a new one and a set of gc.relocates +// which make the relocations happening at this safepoint explicit. +// +// WARNING: Does not do any fixup to adjust users of the original live +// values. That's the callers responsibility. +static void +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, + PartiallyConstructedSafepointRecord &result) { + auto liveset = result.liveset; + auto PointerToBase = result.PointerToBase; + + // Convert to vector for efficient cross referencing. + SmallVector<Value *, 64> basevec, livevec; + livevec.reserve(liveset.size()); + basevec.reserve(liveset.size()); + for (Value *L : liveset) { + livevec.push_back(L); + + assert(PointerToBase.find(L) != PointerToBase.end()); + Value *base = PointerToBase[L]; + basevec.push_back(base); + } + assert(livevec.size() == basevec.size()); + + // To make the output IR slightly more stable (for use in diffs), ensure a + // fixed order of the values in the safepoint (by sorting the value name). + // The order is otherwise meaningless. + stablize_order(basevec, livevec); + + // Do the actual rewriting and delete the old statepoint + makeStatepointExplicitImpl(CS, basevec, livevec, P, result); + CS.getInstruction()->eraseFromParent(); +} + +// Helper function for the relocationViaAlloca. 
+// It receives an iterator over the statepoint's gc relocates and emits a
+// store to the assigned location (via allocaMap) for each one of them.
+// Visited values are added to the visitedLiveValues set; we will later use
+// them for a sanity check.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+
+ for (User *U : GCRelocs) {
+ if (!isa<IntrinsicInst>(U))
+ continue;
+
+ IntrinsicInst *RelocatedValue = cast<IntrinsicInst>(U);
+
+ // We only care about relocates
+ if (RelocatedValue->getIntrinsicID() !=
+ Intrinsic::experimental_gc_relocate) {
+ continue;
+ }
+
+ GCRelocateOperands RelocateOperands(RelocatedValue);
+ Value *OriginalValue =
+ const_cast<Value *>(RelocateOperands.getDerivedPtr());
+ assert(AllocaMap.count(OriginalValue));
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ // Emit store into the related alloca
+ // All gc_relocates are i8 addrspace(1)* typed, so the value must be
+ // bitcast to the correct type according to the alloca.
+ assert(RelocatedValue->getNextNode() && "Should always have one since it's not a terminator");
+ IRBuilder<> Builder(RelocatedValue->getNextNode());
+ Value *CastedRelocatedValue =
+ Builder.CreateBitCast(RelocatedValue, cast<AllocaInst>(Alloca)->getAllocatedType(),
+ RelocatedValue->hasName() ? RelocatedValue->getName() + ".casted" : "");
+
+ StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
+ Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+static void
+insertRematerializationStores(
+ RematerializedValueMapTy RematerializedValues,
+ DenseMap<Value *, Value *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+
+ for (auto RematerializedValuePair: RematerializedValues) {
+ Instruction *RematerializedValue = RematerializedValuePair.first;
+ Value *OriginalValue = RematerializedValuePair.second;
+
+ assert(AllocaMap.count(OriginalValue) &&
+ "Can not find alloca for rematerialized value");
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ StoreInst *Store = new StoreInst(RematerializedValue, Alloca);
+ Store->insertAfter(RematerializedValue);
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+/// Do all the relocation updates via allocas and mem2reg
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+ ArrayRef<struct PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+ // record initial number of (static) allocas; we'll check we have the same
+ // number when we get done.
+
+/// Do all the relocation updates via allocas and mem2reg
+static void relocationViaAlloca(
+    Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+    ArrayRef<struct PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+  // Record the initial number of (static) allocas; we'll check we have the
+  // same number when we get done.
+  int InitialAllocaNum = 0;
+  for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E;
+       I++)
+    if (isa<AllocaInst>(*I))
+      InitialAllocaNum++;
+#endif
+
+  // TODO-PERF: change data structures, reserve
+  DenseMap<Value *, Value *> AllocaMap;
+  SmallVector<AllocaInst *, 200> PromotableAllocas;
+  // Used later to check that we have enough allocas to store all values
+  std::size_t NumRematerializedValues = 0;
+  PromotableAllocas.reserve(Live.size());
+
+  // Emit an alloca for "LiveValue" and record it in "AllocaMap" and
+  // "PromotableAllocas"
+  auto emitAllocaFor = [&](Value *LiveValue) {
+    AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+                                        F.getEntryBlock().getFirstNonPHI());
+    AllocaMap[LiveValue] = Alloca;
+    PromotableAllocas.push_back(Alloca);
+  };
+
+  // Emit an alloca for each live gc pointer
+  for (unsigned i = 0; i < Live.size(); i++) {
+    emitAllocaFor(Live[i]);
+  }
+
+  // Emit allocas for rematerialized values
+  for (size_t i = 0; i < Records.size(); i++) {
+    const struct PartiallyConstructedSafepointRecord &Info = Records[i];
+
+    for (auto RematerializedValuePair : Info.RematerializedValues) {
+      Value *OriginalValue = RematerializedValuePair.second;
+      if (AllocaMap.count(OriginalValue) != 0)
+        continue;
+
+      emitAllocaFor(OriginalValue);
+      ++NumRematerializedValues;
+    }
+  }
+
+  // The next two loops are part of the same conceptual operation. We need to
+  // insert a store to the alloca after the original def and at each
+  // redefinition. We need to insert a load before each use. These are split
+  // into distinct loops for performance reasons.
+
+  // Update the gc pointer after each statepoint: either store a relocated
+  // value or null (if no relocated value was found for this gc pointer and it
+  // is not a gc_result). This must happen before we update the statepoint
+  // with a load of the alloca, otherwise we lose the link between the
+  // statepoint and the old def.
+  for (size_t i = 0; i < Records.size(); i++) {
+    const struct PartiallyConstructedSafepointRecord &Info = Records[i];
+    Value *Statepoint = Info.StatepointToken;
+
+    // This will be used for consistency check
+    DenseSet<Value *> VisitedLiveValues;
+
+    // Insert stores for normal statepoint gc relocates
+    insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+    // If it was an invoke statepoint, we also insert stores for the
+    // exceptional-path gc relocates.
+    if (isa<InvokeInst>(Statepoint)) {
+      insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+                             VisitedLiveValues);
+    }
+
+    // Do the same for rematerialized values
+    insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+                                  VisitedLiveValues);
+
+    if (ClobberNonLive) {
+      // As a debugging aid, pretend that an unrelocated pointer becomes null
+      // at the gc.statepoint. This will turn some subtle GC problems into
+      // slightly easier to debug SEGVs. Note that on large IR files with
+      // lots of gc.statepoints this is extremely costly in both memory and
+      // time.
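For illustration, the clobbering store the block below emits boils down to the following sketch (assumed names; assumes the slot holds a pointer type, as all of these do):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Overwrite a slot whose value was *not* relocated with null, so any stale
// use after the statepoint faults early instead of silently corrupting memory.
static void clobberSlot(AllocaInst *Slot, Instruction *InsertBefore) {
  PointerType *PT = cast<PointerType>(Slot->getAllocatedType());
  new StoreInst(ConstantPointerNull::get(PT), Slot, InsertBefore);
}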
+      SmallVector<AllocaInst *, 64> ToClobber;
+      for (auto Pair : AllocaMap) {
+        Value *Def = Pair.first;
+        AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+        // This value was relocated
+        if (VisitedLiveValues.count(Def)) {
+          continue;
+        }
+        ToClobber.push_back(Alloca);
+      }
+
+      auto InsertClobbersAt = [&](Instruction *IP) {
+        for (auto *AI : ToClobber) {
+          auto AIType = cast<PointerType>(AI->getType());
+          auto PT = cast<PointerType>(AIType->getElementType());
+          Constant *CPN = ConstantPointerNull::get(PT);
+          StoreInst *Store = new StoreInst(CPN, AI);
+          Store->insertBefore(IP);
+        }
+      };
+
+      // Insert the clobbering stores. These may get intermixed with the
+      // gc.results and gc.relocates, but that's fine.
+      if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+        InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt());
+        InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt());
+      } else {
+        BasicBlock::iterator Next(cast<CallInst>(Statepoint));
+        Next++;
+        InsertClobbersAt(Next);
+      }
+    }
+  }
+  // Update uses with loads from the allocas and add stores for the relocated
+  // values
+  for (auto Pair : AllocaMap) {
+    Value *Def = Pair.first;
+    Value *Alloca = Pair.second;
+
+    // We pre-record the uses of the def so that we don't have to worry about
+    // later updates changing the use list.
+    SmallVector<Instruction *, 20> Uses;
+    // PERF: trade a linear scan for repeated reallocation
+    Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+    for (User *U : Def->users()) {
+      if (!isa<ConstantExpr>(U)) {
+        // If the def has a ConstantExpr use, then the def is either a
+        // ConstantExpr itself or null. In either case
+        // (recursively in the first, directly in the second), the oop
+        // it is ultimately dependent on is null and this particular
+        // use does not need to be fixed up.
+        Uses.push_back(cast<Instruction>(U));
+      }
+    }
+
+    std::sort(Uses.begin(), Uses.end());
+    auto Last = std::unique(Uses.begin(), Uses.end());
+    Uses.erase(Last, Uses.end());
+
+    for (Instruction *Use : Uses) {
+      if (isa<PHINode>(Use)) {
+        PHINode *Phi = cast<PHINode>(Use);
+        for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+          if (Def == Phi->getIncomingValue(i)) {
+            LoadInst *Load = new LoadInst(
+                Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+            Phi->setIncomingValue(i, Load);
+          }
+        }
+      } else {
+        LoadInst *Load = new LoadInst(Alloca, "", Use);
+        Use->replaceUsesOfWith(Def, Load);
+      }
+    }
+
+    // Emit a store for the initial gc value. The store must be inserted
+    // after the load, otherwise the store would be in the alloca's use list
+    // and an extra load would be inserted before it.
+    StoreInst *Store = new StoreInst(Def, Alloca);
+    if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+      if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+        // InvokeInst is a TerminatorInst, so the store needs to be inserted
+        // into its normal destination block.
+        BasicBlock *NormalDest = Invoke->getNormalDest();
+        Store->insertBefore(NormalDest->getFirstNonPHI());
+      } else {
+        assert(!Inst->isTerminator() &&
+               "The only TerminatorInst that can produce a value is "
+               "InvokeInst which is handled above.");
+        Store->insertAfter(Inst);
+      }
+    } else {
+      assert(isa<Argument>(Def));
+      Store->insertAfter(cast<Instruction>(Alloca));
+    }
+  }
+
+  assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+         "we must have one alloca for each live and rematerialized value");
+  if (!PromotableAllocas.empty()) {
+    // Apply mem2reg to promote the allocas to SSA
+    PromoteMemToReg(PromotableAllocas, DT);
+  }
+
+#ifndef NDEBUG
+  for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E;
+       I++)
+    if (isa<AllocaInst>(*I))
+      InitialAllocaNum--;
+  assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require that we sort the input
+/// vector. Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+  DenseSet<T> Seen;
+  SmallVector<T, 128> TempVec;
+  TempVec.reserve(Vec.size());
+  for (auto Element : Vec)
+    TempVec.push_back(Element);
+  Vec.clear();
+  for (auto V : TempVec) {
+    if (Seen.insert(V).second) {
+      Vec.push_back(V);
+    }
+  }
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+                                 SmallVectorImpl<CallInst *> &Holders) {
+  if (Values.empty())
+    // No values to hold live, might as well not insert the empty holder
+    return;
+
+  Module *M = CS.getInstruction()->getParent()->getParent()->getParent();
+  // Use a dummy vararg function to actually hold the values live
+  Function *Func = cast<Function>(M->getOrInsertFunction(
+      "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true)));
+  if (CS.isCall()) {
+    // For call safepoints insert dummy calls right after the safepoint
+    BasicBlock::iterator Next(CS.getInstruction());
+    Next++;
+    Holders.push_back(CallInst::Create(Func, Values, "", Next));
+    return;
+  }
+  // For invoke safepoints insert dummy calls in both the normal and
+  // exceptional destination blocks
+  auto *II = cast<InvokeInst>(CS.getInstruction());
+  Holders.push_back(CallInst::Create(
+      Func, Values, "", II->getNormalDest()->getFirstInsertionPt()));
+  Holders.push_back(CallInst::Create(
+      Func, Values, "", II->getUnwindDest()->getFirstInsertionPt()));
+}
+
+static void findLiveReferences(
+    Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  GCPtrLivenessData OriginalLivenessData;
+  computeLiveInValues(DT, F, OriginalLivenessData);
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    const CallSite &CS = toUpdate[i];
+    analyzeParsePointLiveness(DT, OriginalLivenessData, CS, info);
+  }
+}
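As a rough stand-alone equivalent of the unique_unsorted helper above (STL only; assumes T is ordered so it fits in a std::set), the same order-preserving deduplication can be done in place:

#include <algorithm>
#include <set>
#include <vector>

// Drop duplicates while preserving first-occurrence order, without sorting
// the vector itself (so downstream test output order stays stable).
template <typename T> void uniqueUnsorted(std::vector<T> &Vec) {
  std::set<T> Seen;
  Vec.erase(std::remove_if(Vec.begin(), Vec.end(),
                           [&Seen](const T &V) {
                             return !Seen.insert(V).second;
                           }),
            Vec.end());
}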
+
+/// Remove any vector of pointers from the liveset by scalarizing them over
+/// the statepoint instruction. Adds the scalarized pieces to the liveset. It
+/// would be preferable to include the vector in the statepoint itself, but
+/// the lowering code currently does not handle that. Extending it would be
+/// slightly non-trivial since it requires a format change. Given how rare
+/// such cases are (for the moment?), scalarizing is an acceptable compromise.
+static void splitVectorValues(Instruction *StatepointInst,
+                              StatepointLiveSetTy &LiveSet, DominatorTree &DT) {
+  SmallVector<Value *, 16> ToSplit;
+  for (Value *V : LiveSet)
+    if (isa<VectorType>(V->getType()))
+      ToSplit.push_back(V);
+
+  if (ToSplit.empty())
+    return;
+
+  Function &F = *(StatepointInst->getParent()->getParent());
+
+  DenseMap<Value *, AllocaInst *> AllocaMap;
+  // First is normal return, second is exceptional return (invoke only)
+  DenseMap<Value *, std::pair<Value *, Value *>> Replacements;
+  for (Value *V : ToSplit) {
+    LiveSet.erase(V);
+
+    AllocaInst *Alloca =
+        new AllocaInst(V->getType(), "", F.getEntryBlock().getFirstNonPHI());
+    AllocaMap[V] = Alloca;
+
+    VectorType *VT = cast<VectorType>(V->getType());
+    IRBuilder<> Builder(StatepointInst);
+    SmallVector<Value *, 16> Elements;
+    for (unsigned i = 0; i < VT->getNumElements(); i++)
+      Elements.push_back(Builder.CreateExtractElement(V, Builder.getInt32(i)));
+    LiveSet.insert(Elements.begin(), Elements.end());
+
+    auto InsertVectorReform = [&](Instruction *IP) {
+      Builder.SetInsertPoint(IP);
+      Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+      Value *ResultVec = UndefValue::get(VT);
+      for (unsigned i = 0; i < VT->getNumElements(); i++)
+        ResultVec = Builder.CreateInsertElement(ResultVec, Elements[i],
+                                                Builder.getInt32(i));
+      return ResultVec;
+    };
+
+    if (isa<CallInst>(StatepointInst)) {
+      BasicBlock::iterator Next(StatepointInst);
+      Next++;
+      Instruction *IP = &*(Next);
+      Replacements[V].first = InsertVectorReform(IP);
+      Replacements[V].second = nullptr;
+    } else {
+      InvokeInst *Invoke = cast<InvokeInst>(StatepointInst);
+      // We've already normalized - check that we don't have shared
+      // destination blocks
+      BasicBlock *NormalDest = Invoke->getNormalDest();
+      assert(!isa<PHINode>(NormalDest->begin()));
+      BasicBlock *UnwindDest = Invoke->getUnwindDest();
+      assert(!isa<PHINode>(UnwindDest->begin()));
+      // Insert insertelement sequences in both successors
+      Instruction *IP = &*(NormalDest->getFirstInsertionPt());
+      Replacements[V].first = InsertVectorReform(IP);
+      IP = &*(UnwindDest->getFirstInsertionPt());
+      Replacements[V].second = InsertVectorReform(IP);
+    }
+  }
+  for (Value *V : ToSplit) {
+    AllocaInst *Alloca = AllocaMap[V];
+
+    // Capture all users before we start mutating use lists
+    SmallVector<Instruction *, 16> Users;
+    for (User *U : V->users())
+      Users.push_back(cast<Instruction>(U));
+
+    for (Instruction *I : Users) {
+      if (auto Phi = dyn_cast<PHINode>(I)) {
+        for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++)
+          if (V == Phi->getIncomingValue(i)) {
+            LoadInst *Load = new LoadInst(
+                Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+            Phi->setIncomingValue(i, Load);
+          }
+      } else {
+        LoadInst *Load = new LoadInst(Alloca, "", I);
+        I->replaceUsesOfWith(V, Load);
+      }
+    }
+
+    // Store the original value and the replacement value into the alloca
+    StoreInst *Store = new StoreInst(V, Alloca);
+    if (auto I = dyn_cast<Instruction>(V))
+      Store->insertAfter(I);
+    else
+      Store->insertAfter(Alloca);
+
+    // Normal return for invoke, or call return
+    Instruction *Replacement = cast<Instruction>(Replacements[V].first);
+    (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
+    // Unwind return for invoke only
+    Replacement = cast_or_null<Instruction>(Replacements[V].second);
+    if (Replacement)
+      (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
+  }
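The extract/reform round trip used above can be sketched in isolation (a sketch only, with the same 3.7-era IRBuilder calls; the statepoint placement is elided):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Scalarize a live vector V before the statepoint and rebuild it afterwards.
// Each element becomes an independent, individually relocatable live value.
static Value *scalarizeAndReform(IRBuilder<> &B, Value *V, VectorType *VT) {
  SmallVector<Value *, 16> Elements;
  for (unsigned i = 0; i < VT->getNumElements(); i++)
    Elements.push_back(B.CreateExtractElement(V, B.getInt32(i)));
  // ... the statepoint goes here; each element gets relocated on its own ...
  Value *Result = UndefValue::get(VT);
  for (unsigned i = 0; i < VT->getNumElements(); i++)
    Result = B.CreateInsertElement(Result, Elements[i], B.getInt32(i));
  return Result;
}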
+
+  // Apply mem2reg to promote the allocas to SSA
+  SmallVector<AllocaInst *, 16> Allocas;
+  for (Value *V : ToSplit)
+    Allocas.push_back(AllocaMap[V]);
+  PromoteMemToReg(Allocas, DT);
+}
+
+// Helper function for rematerializeLiveValues. It walks the use chain
+// starting from "CurrentValue" until it meets "BaseValue". Only "simple"
+// values are visited (currently GEPs and casts). Returns true if it
+// successfully reached "BaseValue" and false otherwise.
+// Fills the "ChainToBase" array with all visited values; "BaseValue" is not
+// recorded.
+static bool findRematerializableChainToBasePointer(
+  SmallVectorImpl<Instruction*> &ChainToBase,
+  Value *CurrentValue, Value *BaseValue) {
+
+  // We have found a base value
+  if (CurrentValue == BaseValue) {
+    return true;
+  }
+
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
+    ChainToBase.push_back(GEP);
+    return findRematerializableChainToBasePointer(ChainToBase,
+                                                  GEP->getPointerOperand(),
+                                                  BaseValue);
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
+    Value *Def = CI->stripPointerCasts();
+
+    // These two checks are essentially equivalent. The first is here for
+    // consistency with the findBasePointers logic.
+    assert(!isa<CastInst>(Def) && "not a pointer cast found");
+    if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
+      return false;
+
+    ChainToBase.push_back(CI);
+    return findRematerializableChainToBasePointer(ChainToBase, Def, BaseValue);
+  }
+
+  // Unsupported instruction in the chain
+  return false;
+}
+
+// Helper function for rematerializeLiveValues. Computes the cost of the use
+// chain we are about to rematerialize.
+static unsigned
+chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
+                       TargetTransformInfo &TTI) {
+  unsigned Cost = 0;
+
+  for (Instruction *Instr : Chain) {
+    if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
+      assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+             "non noop cast is found during rematerialization");
+
+      Type *SrcTy = CI->getOperand(0)->getType();
+      Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+      // Cost of the address calculation
+      Type *ValTy = GEP->getPointerOperandType()->getPointerElementType();
+      Cost += TTI.getAddressComputationCost(ValTy);
+
+      // And cost of the GEP itself
+      // TODO: Use TTI->getGEPCost here (it exists, but appears not to be
+      // allowed for external usage)
+      if (!GEP->hasAllConstantIndices())
+        Cost += 2;
+
+    } else {
+      llvm_unreachable("unsupported instruction type during rematerialization");
+    }
+  }
+
+  return Cost;
+}
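To make the costing concrete, here is a purely hypothetical walk-through; the real numbers come from TTI and vary by target:

#include <cassert>

// Hypothetical cost of rematerializing "%d = bitcast (gep %b, i64 %i)":
// a no-op bitcast is assumed free, the GEP pays an address computation
// (assumed 0 here) plus 2 for its non-constant index, and an invoke doubles
// everything because the chain is cloned on both the normal and unwind paths.
int main() {
  unsigned Cost = 0;
  Cost += 0;     // no-op bitcast (TTI::getCastInstrCost, assumed 0)
  Cost += 0 + 2; // GEP: address computation (assumed 0) + variable indices
  Cost *= 2;     // invoke statepoint: normal + unwind clones
  assert(Cost == 4); // then compared against RematerializationThreshold
  return 0;
}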
+
+// From the statepoint liveset pick values that are cheaper to recompute than
+// to relocate. Remove these values from the liveset, rematerialize them after
+// the statepoint and record them in the "Info" structure. Note that, as with
+// relocated values, we don't do any user adjustments here.
+static void rematerializeLiveValues(CallSite CS,
+                                    PartiallyConstructedSafepointRecord &Info,
+                                    TargetTransformInfo &TTI) {
+  const unsigned int ChainLengthThreshold = 10;
+
+  // Record values we are going to delete from this statepoint live set.
+  // We cannot do this in the following loop due to iterator invalidation.
+  SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+  for (Value *LiveValue: Info.liveset) {
+    // For each live pointer find its defining chain
+    SmallVector<Instruction *, 3> ChainToBase;
+    assert(Info.PointerToBase.find(LiveValue) != Info.PointerToBase.end());
+    bool FoundChain =
+      findRematerializableChainToBasePointer(ChainToBase,
+                                             LiveValue,
+                                             Info.PointerToBase[LiveValue]);
+    // Nothing to do, or chain is too long
+    if (!FoundChain ||
+        ChainToBase.size() == 0 ||
+        ChainToBase.size() > ChainLengthThreshold)
+      continue;
+
+    // Compute the cost of this chain
+    unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
+    // TODO: We can also account for cases when we will be able to remove some
+    //       of the rematerialized values by later optimization passes, e.g.
+    //       if we rematerialized several intersecting chains, or if the
+    //       original values don't have any uses besides this statepoint.
+
+    // For invokes we need to rematerialize each chain twice - for the normal
+    // and for the unwind basic blocks. Model this by multiplying the cost by
+    // two.
+    if (CS.isInvoke()) {
+      Cost *= 2;
+    }
+    // If it's too expensive, skip it
+    if (Cost >= RematerializationThreshold)
+      continue;
+
+    // Remove the value from the live set
+    LiveValuesToBeDeleted.push_back(LiveValue);
+
+    // Clone instructions and record them inside the "Info" structure
+
+    // Walk backwards to visit the top-most instructions first
+    std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+    // Utility function which clones all instructions from "ChainToBase"
+    // and inserts them before "InsertBefore". Returns the rematerialized
+    // value which should be used after the statepoint.
+    auto rematerializeChain = [&ChainToBase](Instruction *InsertBefore) {
+      Instruction *LastClonedValue = nullptr;
+      Instruction *LastValue = nullptr;
+      for (Instruction *Instr: ChainToBase) {
+        // Only GEPs and casts are supported, as we need to be careful not to
+        // introduce any new uses of pointers not in the liveset.
+        // Note that it's fine to introduce new uses of pointers which were
+        // otherwise not used after this statepoint.
+        assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+        Instruction *ClonedValue = Instr->clone();
+        ClonedValue->insertBefore(InsertBefore);
+        ClonedValue->setName(Instr->getName() + ".remat");
+
+        // If it is not the first instruction in the chain, it uses the
+        // previously cloned value; update it to use the clone.
+        if (LastClonedValue) {
+          assert(LastValue);
+          ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+          // Assert that the cloned instruction does not use any instructions
+          // from this chain other than LastClonedValue
+          for (auto OpValue : ClonedValue->operand_values()) {
+            assert(std::find(ChainToBase.begin(), ChainToBase.end(), OpValue) ==
+                       ChainToBase.end() &&
+                   "incorrect use in rematerialization chain");
+          }
+#endif
+        }
+
+        LastClonedValue = ClonedValue;
+        LastValue = Instr;
+      }
+      assert(LastClonedValue);
+      return LastClonedValue;
+    };
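In isolation, the per-instruction cloning step looks roughly like this (a sketch; RelocatedBase is an assumed name for the gc.relocate of the chain's base):

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Recompute a derived pointer after the statepoint instead of relocating it:
// clone the defining GEP and point it at the relocated base.
static Instruction *rematerializeGEP(GetElementPtrInst *Derived,
                                     Value *RelocatedBase,
                                     Instruction *InsertBefore) {
  Instruction *Clone = Derived->clone();   // same element type and indices
  Clone->insertBefore(InsertBefore);       // lives on the post-statepoint path
  Clone->replaceUsesOfWith(Derived->getPointerOperand(), RelocatedBase);
  Clone->setName(Derived->getName() + ".remat");
  return Clone;
}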
+
+    // Different cases for calls and invokes. For invokes we need to clone
+    // instructions on both the normal and unwind paths.
+    if (CS.isCall()) {
+      Instruction *InsertBefore = CS.getInstruction()->getNextNode();
+      assert(InsertBefore);
+      Instruction *RematerializedValue = rematerializeChain(InsertBefore);
+      Info.RematerializedValues[RematerializedValue] = LiveValue;
+    } else {
+      InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());
+
+      Instruction *NormalInsertBefore =
+          Invoke->getNormalDest()->getFirstInsertionPt();
+      Instruction *UnwindInsertBefore =
+          Invoke->getUnwindDest()->getFirstInsertionPt();
+
+      Instruction *NormalRematerializedValue =
+          rematerializeChain(NormalInsertBefore);
+      Instruction *UnwindRematerializedValue =
+          rematerializeChain(UnwindInsertBefore);
+
+      Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+      Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+    }
+  }
+
+  // Remove rematerialized values from the live set
+  for (auto LiveValue: LiveValuesToBeDeleted) {
+    Info.liveset.erase(LiveValue);
+  }
+}
+
+static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,
+                              SmallVectorImpl<CallSite> &toUpdate) {
+#ifndef NDEBUG
+  // Sanity-check the input
+  std::set<CallSite> uniqued;
+  uniqued.insert(toUpdate.begin(), toUpdate.end());
+  assert(uniqued.size() == toUpdate.size() && "no duplicates please!");
+
+  for (size_t i = 0; i < toUpdate.size(); i++) {
+    CallSite &CS = toUpdate[i];
+    assert(CS.getInstruction()->getParent()->getParent() == &F);
+    assert(isStatepoint(CS) && "expected to already be a deopt statepoint");
+  }
+#endif
+
+  // When inserting gc.relocates for invokes, we need to be able to insert at
+  // the top of the successor blocks. See the comment on
+  // normalizeForInvokeSafepoint for exactly what is needed. Note that this
+  // step may restructure the CFG.
+  for (CallSite CS : toUpdate) {
+    if (!CS.isInvoke())
+      continue;
+    InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction());
+    normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(),
+                                P);
+    normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(),
+                                P);
+  }
+
+  // A list of dummy calls added to the IR to keep various values obviously
+  // live in the IR. We'll remove all of these when done.
+  SmallVector<CallInst *, 64> holders;
+
+  // Insert a dummy call with all of the arguments to the vm_state we'll need
+  // for the actual safepoint insertion. This ensures reference arguments in
+  // the deopt argument list are considered live through the safepoint (and
+  // thus makes sure they get relocated.)
+  for (size_t i = 0; i < toUpdate.size(); i++) {
+    CallSite &CS = toUpdate[i];
+    Statepoint StatepointCS(CS);
+
+    SmallVector<Value *, 64> DeoptValues;
+    for (Use &U : StatepointCS.vm_state_args()) {
+      Value *Arg = cast<Value>(&U);
+      assert(!isUnhandledGCPointerType(Arg->getType()) &&
+             "support for FCA unimplemented");
+      if (isHandledGCPointerType(Arg->getType()))
+        DeoptValues.push_back(Arg);
+    }
+    insertUseHolderAfter(CS, DeoptValues, holders);
+  }
+
+  SmallVector<struct PartiallyConstructedSafepointRecord, 64> records;
+  records.reserve(toUpdate.size());
+  for (size_t i = 0; i < toUpdate.size(); i++) {
+    struct PartiallyConstructedSafepointRecord info;
+    records.push_back(info);
+  }
+  assert(records.size() == toUpdate.size());
+
+  // A) Identify all gc pointers which are statically live at the given call
+  // site.
+  findLiveReferences(F, DT, P, toUpdate, records);
+
+  // Do a limited scalarization of any live-at-safepoint vector values which
+  // contain pointers. This enables this pass to run after vectorization at
+  // the cost of some possible performance loss. TODO: it would be nice to
+  // natively support vectors all the way through the backend so we don't
+  // need to scalarize here.
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    Instruction *statepoint = toUpdate[i].getInstruction();
+    splitVectorValues(cast<Instruction>(statepoint), info.liveset, DT);
+  }
+
+  // B) Find the base pointers for each live pointer
+  /* scope for caching */ {
+    // Cache the 'defining value' relation used in the computation and
+    // insertion of base phis and selects. This ensures that we don't insert
+    // large numbers of duplicate base_phis.
+    DefiningValueMapTy DVCache;
+
+    for (size_t i = 0; i < records.size(); i++) {
+      struct PartiallyConstructedSafepointRecord &info = records[i];
+      CallSite &CS = toUpdate[i];
+      findBasePointers(DT, DVCache, CS, info);
+    }
+  } // end of cache scope
+
+  // The base phi insertion logic (for any safepoint) may have inserted new
+  // instructions which are now live at some safepoint. The simplest such
+  // example is:
+  // loop:
+  //   phi a  <-- will be a new base_phi here
+  //   safepoint 1 <-- that needs to be live here
+  //   gep a + 1
+  //   safepoint 2
+  //   br loop
+  // We insert some dummy calls after each safepoint to definitely hold live
+  // the base pointers which were identified for that safepoint. We'll then
+  // ask liveness for _every_ base inserted to see what is now live. Then we
+  // remove the dummy calls.
+  holders.reserve(holders.size() + records.size());
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    CallSite &CS = toUpdate[i];
+
+    SmallVector<Value *, 128> Bases;
+    for (auto Pair : info.PointerToBase) {
+      Bases.push_back(Pair.second);
+    }
+    insertUseHolderAfter(CS, Bases, holders);
+  }
+
+  // By selecting base pointers, we've effectively inserted new uses. Thus, we
+  // need to rerun liveness. We may *also* have inserted new defs, but that's
+  // not the key issue.
+  recomputeLiveInValues(F, DT, P, toUpdate, records);
+
+  if (PrintBasePointers) {
+    for (size_t i = 0; i < records.size(); i++) {
+      struct PartiallyConstructedSafepointRecord &info = records[i];
+      errs() << "Base Pairs: (w/Relocation)\n";
+      for (auto Pair : info.PointerToBase) {
+        errs() << " derived %" << Pair.first->getName() << " base %"
+               << Pair.second->getName() << "\n";
+      }
+    }
+  }
+  for (size_t i = 0; i < holders.size(); i++) {
+    holders[i]->eraseFromParent();
+    holders[i] = nullptr;
+  }
+  holders.clear();
+
+  // In order to reduce the live set of a statepoint we might choose to
+  // rematerialize some values instead of relocating them. This is purely an
+  // optimization and does not influence correctness.
+  TargetTransformInfo &TTI =
+      P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    CallSite &CS = toUpdate[i];
+
+    rematerializeLiveValues(CS, info, TTI);
+  }
+
+  // Now run through and replace the existing statepoints with new ones with
+  // the live variables listed. We do not yet update uses of the values being
+  // relocated. We have references to live variables that need to
+  // survive to the last iteration of this loop. (By construction, the
+  // previous statepoint cannot be a live variable, so we can remove the old
+  // statepoint calls as we go.)
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    CallSite &CS = toUpdate[i];
+    makeStatepointExplicit(DT, CS, P, info);
+  }
+  toUpdate.clear(); // prevent accidental use of invalid CallSites
+
+  // Do all the fixups of the original live variables to their relocated selves
+  SmallVector<Value *, 128> live;
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    // We can't simply save the live set from the original insertion. One of
+    // the live values might be the result of a call which needs a safepoint.
+    // That Value* no longer exists and we need to use the new gc_result.
+    // Thankfully, the liveset is embedded in the statepoint (and updated), so
+    // we just grab that.
+    Statepoint statepoint(info.StatepointToken);
+    live.insert(live.end(), statepoint.gc_args_begin(),
+                statepoint.gc_args_end());
+#ifndef NDEBUG
+    // Do some basic sanity checks on our liveness results before performing
+    // relocation. Relocation can and will turn mistakes in liveness results
+    // into nonsensical code which is much harder to debug.
+    // TODO: It would be nice to test consistency as well
+    assert(DT.isReachableFromEntry(info.StatepointToken->getParent()) &&
+           "statepoint must be reachable or liveness is meaningless");
+    for (Value *V : statepoint.gc_args()) {
+      if (!isa<Instruction>(V))
+        // Non-instruction values trivially dominate all possible uses
+        continue;
+      auto LiveInst = cast<Instruction>(V);
+      assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+             "unreachable values should never be live");
+      assert(DT.dominates(LiveInst, info.StatepointToken) &&
+             "basic SSA liveness expectation violated by liveness analysis");
+    }
+#endif
+  }
+  unique_unsorted(live);
+
+#ifndef NDEBUG
+  // Sanity check
+  for (auto ptr : live) {
+    assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type");
+  }
+#endif
+
+  relocationViaAlloca(F, DT, live, records);
+  return !records.empty();
+}
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (F.hasGC()) {
+    const char *FunctionGCName = F.getGC();
+    const StringRef StatepointExampleName("statepoint-example");
+    const StringRef CoreCLRName("coreclr");
+    return (StatepointExampleName == FunctionGCName) ||
+           (CoreCLRName == FunctionGCName);
+  } else
+    return false;
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+  // Nothing to do for declarations.
+  if (F.isDeclaration() || F.empty())
+    return false;
+
+  // Policy choice says not to rewrite - the most common reason is that we're
+  // compiling code without a GCStrategy.
+  if (!shouldRewriteStatepointsIn(F))
+    return false;
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  // Gather all the statepoints which need to be rewritten. Be careful to
+  // only consider those in reachable code since we need to ask dominance
+  // queries when rewriting. We'll delete the unreachable ones in a moment.
+  SmallVector<CallSite, 64> ParsePointNeeded;
+  bool HasUnreachableStatepoint = false;
+  for (Instruction &I : inst_range(F)) {
+    // TODO: only the ones with the flag set!
+    if (isStatepoint(I)) {
+      if (DT.isReachableFromEntry(I.getParent()))
+        ParsePointNeeded.push_back(CallSite(&I));
+      else
+        HasUnreachableStatepoint = true;
+    }
+  }
+
+  bool MadeChange = false;
+
+  // Delete any unreachable statepoints so that we don't have unrewritten
+  // statepoints surviving this pass. This makes testing easier and the
+  // resulting IR less confusing to human readers. Rather than be fancy, we
+  // just reuse a utility function which removes the unreachable blocks.
+  if (HasUnreachableStatepoint)
+    MadeChange |= removeUnreachableBlocks(F);
+
+  // Return early if no work to do.
+  if (ParsePointNeeded.empty())
+    return MadeChange;
+
+  // As a prepass, go ahead and aggressively destroy single entry phi nodes.
+  // These are created by LCSSA. They have the effect of increasing the size
+  // of liveness sets for no good reason. It may be harder to do this post
+  // insertion since relocations and base phis can confuse things.
+  for (BasicBlock &BB : F)
+    if (BB.getUniquePredecessor()) {
+      MadeChange = true;
+      FoldSingleEntryPHINodes(&BB);
+    }
+
+  MadeChange |= insertParsePoints(F, DT, this, ParsePointNeeded);
+  return MadeChange;
+}
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
+
+/// Compute the live-in set for the location rbegin starting from
+/// the live-out set of the basic block
+static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
+                                BasicBlock::reverse_iterator rend,
+                                DenseSet<Value *> &LiveTmp) {
+
+  for (BasicBlock::reverse_iterator ritr = rbegin; ritr != rend; ritr++) {
+    Instruction *I = &*ritr;
+
+    // KILL/Def - Remove this definition from LiveIn
+    LiveTmp.erase(I);
+
+    // Don't consider *uses* in PHI nodes, we handle their contribution to
+    // predecessor blocks when we seed the LiveOut sets
+    if (isa<PHINode>(I))
+      continue;
+
+    // USE - Add to the LiveIn set for this instruction
+    for (Value *V : I->operands()) {
+      assert(!isUnhandledGCPointerType(V->getType()) &&
+             "support for FCA unimplemented");
+      if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+        // The choice to exclude all things constant here is slightly subtle.
+        // There are two independent reasons:
+        // - We assume that things which are constant (from LLVM's definition)
+        //   do not move at runtime. For example, the address of a global
+        //   variable is fixed, even though its contents may not be.
+        // - Second, we can't disallow arbitrary inttoptr constants even
+        //   if the language frontend does. Optimization passes are free to
+        //   locally exploit facts without respect to global reachability.
+        //   This can create sections of code which are dynamically
+        //   unreachable and contain just about anything.
+        //   (see constants.ll in tests)
+        LiveTmp.insert(V);
+      }
+    }
+  }
+}
+
+static void computeLiveOutSeed(BasicBlock *BB, DenseSet<Value *> &LiveTmp) {
+
+  for (BasicBlock *Succ : successors(BB)) {
+    const BasicBlock::iterator E(Succ->getFirstNonPHI());
+    for (BasicBlock::iterator I = Succ->begin(); I != E; I++) {
+      PHINode *Phi = cast<PHINode>(&*I);
+      Value *V = Phi->getIncomingValueForBlock(BB);
+      assert(!isUnhandledGCPointerType(V->getType()) &&
+             "support for FCA unimplemented");
+      if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+        LiveTmp.insert(V);
+      }
+    }
+  }
+}
+
+static DenseSet<Value *> computeKillSet(BasicBlock *BB) {
+  DenseSet<Value *> KillSet;
+  for (Instruction &I : *BB)
+    if (isHandledGCPointerType(I.getType()))
+      KillSet.insert(&I);
+  return KillSet;
+}
+
+#ifndef NDEBUG
+/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
+/// sanity check for the liveness computation.
+static void checkBasicSSA(DominatorTree &DT, DenseSet<Value *> &Live,
+                          TerminatorInst *TI, bool TermOkay = false) {
+  for (Value *V : Live) {
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      // The terminator can be a member of the LiveOut set. LLVM's definition
+      // of instruction dominance states that V does not dominate itself. As
+      // such, we need to special case this to allow it.
+      if (TermOkay && TI == I)
+        continue;
+      assert(DT.dominates(I, TI) &&
+             "basic SSA liveness expectation violated by liveness analysis");
+    }
+  }
+}
+
+/// Check that all the liveness sets used during the computation of liveness
+/// obey basic SSA properties. This is useful for finding cases where we miss
+/// a def.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+                          BasicBlock &BB) {
+  checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
+  checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
+  checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
+}
+#endif
+
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+                                GCPtrLivenessData &Data) {
+
+  SmallSetVector<BasicBlock *, 200> Worklist;
+  auto AddPredsToWorklist = [&](BasicBlock *BB) {
+    // We use a SetVector so that we don't have duplicates in the worklist.
+    Worklist.insert(pred_begin(BB), pred_end(BB));
+  };
+  auto NextItem = [&]() {
+    BasicBlock *BB = Worklist.back();
+    Worklist.pop_back();
+    return BB;
+  };
+
+  // Seed the liveness for each individual block
+  for (BasicBlock &BB : F) {
+    Data.KillSet[&BB] = computeKillSet(&BB);
+    Data.LiveSet[&BB].clear();
+    computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+    for (Value *Kill : Data.KillSet[&BB])
+      assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+    Data.LiveOut[&BB] = DenseSet<Value *>();
+    computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+    Data.LiveIn[&BB] = Data.LiveSet[&BB];
+    set_union(Data.LiveIn[&BB], Data.LiveOut[&BB]);
+    set_subtract(Data.LiveIn[&BB], Data.KillSet[&BB]);
+    if (!Data.LiveIn[&BB].empty())
+      AddPredsToWorklist(&BB);
+  }
+
+  // Propagate that liveness until stable
+  while (!Worklist.empty()) {
+    BasicBlock *BB = NextItem();
+
+    // Compute our new liveout set, then exit early if it hasn't changed
+    // despite the contribution of our successors.
+    DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+    const auto OldLiveOutSize = LiveOut.size();
+    for (BasicBlock *Succ : successors(BB)) {
+      assert(Data.LiveIn.count(Succ));
+      set_union(LiveOut, Data.LiveIn[Succ]);
+    }
+    // assert OldLiveOut is a subset of LiveOut
+    if (OldLiveOutSize == LiveOut.size()) {
+      // If the sets are the same size, then we didn't actually add anything
+      // when unioning our successors' LiveIn. Thus, the LiveIn of this block
+      // hasn't changed.
+      continue;
+    }
+    Data.LiveOut[BB] = LiveOut;
+
+    // Apply the effects of this basic block
+    DenseSet<Value *> LiveTmp = LiveOut;
+    set_union(LiveTmp, Data.LiveSet[BB]);
+    set_subtract(LiveTmp, Data.KillSet[BB]);
+
+    assert(Data.LiveIn.count(BB));
+    const DenseSet<Value *> &OldLiveIn = Data.LiveIn[BB];
+    // assert: OldLiveIn is a subset of LiveTmp
+    if (OldLiveIn.size() != LiveTmp.size()) {
+      Data.LiveIn[BB] = LiveTmp;
+      AddPredsToWorklist(BB);
+    }
+  } // while (!Worklist.empty())
+
+#ifndef NDEBUG
+  // Sanity check our output against SSA properties. This helps catch any
+  // missing kills during the above iteration.
+  for (BasicBlock &BB : F) {
+    checkBasicSSA(DT, Data, BB);
+  }
+#endif
+}
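A stripped-down, stand-alone version of the same backward dataflow may help make the fixed-point structure visible (STL only; strings stand in for SSA values, and Gen/Kill are simplified relative to the per-instruction walk above):

#include <map>
#include <set>
#include <string>
#include <vector>

typedef std::set<std::string> ValueSet;

struct Block {
  std::vector<Block *> Preds, Succs;
  ValueSet Gen, Kill;        // gc-pointer uses and defs in the block
  ValueSet LiveIn, LiveOut;
};

// Iterate LiveOut(B) = union of LiveIn(S) over successors S, and
// LiveIn(B) = (LiveOut(B) | Gen(B)) - Kill(B), until nothing changes.
static void computeLiveness(std::vector<Block *> &Blocks) {
  std::set<Block *> Worklist(Blocks.begin(), Blocks.end());
  while (!Worklist.empty()) {
    Block *B = *Worklist.begin();
    Worklist.erase(Worklist.begin());
    ValueSet Out = B->LiveOut;
    for (Block *S : B->Succs)
      Out.insert(S->LiveIn.begin(), S->LiveIn.end());
    ValueSet In = Out;
    In.insert(B->Gen.begin(), B->Gen.end());
    for (const std::string &K : B->Kill)
      In.erase(K);
    if (In != B->LiveIn || Out != B->LiveOut) {
      B->LiveIn = In;
      B->LiveOut = Out;
      // Only predecessors can be affected by a LiveIn change.
      Worklist.insert(B->Preds.begin(), B->Preds.end());
    }
  }
}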
+
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+                              StatepointLiveSetTy &Out) {
+
+  BasicBlock *BB = Inst->getParent();
+
+  // Note: The copy is intentional and required
+  assert(Data.LiveOut.count(BB));
+  DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+
+  // We want to handle the statepoint itself oddly. Its call result is
+  // not live (normal), nor are its arguments (unless they're used again
+  // later). This adjustment is specifically what we need to relocate.
+  BasicBlock::reverse_iterator rend(Inst);
+  computeLiveInValues(BB->rbegin(), rend, LiveOut);
+  LiveOut.erase(Inst);
+  Out.insert(LiveOut.begin(), LiveOut.end());
+}
+
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+                                  const CallSite &CS,
+                                  PartiallyConstructedSafepointRecord &Info) {
+  Instruction *Inst = CS.getInstruction();
+  StatepointLiveSetTy Updated;
+  findLiveSetAtInst(Inst, RevisedLivenessData, Updated);
+
+#ifndef NDEBUG
+  DenseSet<Value *> Bases;
+  for (auto KVPair : Info.PointerToBase) {
+    Bases.insert(KVPair.second);
+  }
+#endif
+  // We may have base pointers which are now live that weren't before. We
+  // need to update the PointerToBase structure to reflect this.
+  for (auto V : Updated)
+    if (!Info.PointerToBase.count(V)) {
+      assert(Bases.count(V) && "can't find base for unexpected live value");
+      Info.PointerToBase[V] = V;
+      continue;
+    }
+
+#ifndef NDEBUG
+  for (auto V : Updated) {
+    assert(Info.PointerToBase.count(V) &&
+           "must be able to find base for live value");
+  }
+#endif
+
+  // Remove any stale base mappings - this can happen since our liveness is
+  // more precise than the one inherent in the base pointer analysis
+  DenseSet<Value *> ToErase;
+  for (auto KVPair : Info.PointerToBase)
+    if (!Updated.count(KVPair.first))
+      ToErase.insert(KVPair.first);
+  for (auto V : ToErase)
+    Info.PointerToBase.erase(V);
+
+#ifndef NDEBUG
+  for (auto KVPair : Info.PointerToBase)
+    assert(Updated.count(KVPair.first) && "record for non-live value");
+#endif
+
+  Info.liveset = Updated;
+}
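As a usage note (a sketch, not from this commit): a frontend opts a function into this rewrite via its GC attribute, using one of the two strategy names recognized by shouldRewriteStatepointsIn above.

#include "llvm/IR/Function.h"

// Mark a function so RewriteStatepointsForGC will process it.
static void optIntoStatepointRewriting(llvm::Function &F) {
  F.setGC("statepoint-example"); // or "coreclr"
}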
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index cfc9a8e..bc068f7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -35,7 +36,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -154,7 +154,7 @@ namespace {
 /// Constant Propagation.
 ///
 class SCCPSolver : public InstVisitor<SCCPSolver> {
-  const DataLayout *DL;
+  const DataLayout &DL;
   const TargetLibraryInfo *TLI;
   SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
   DenseMap<Value*, LatticeVal> ValueState;  // The state each value is in.
@@ -206,8 +206,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
   typedef std::pair<BasicBlock*, BasicBlock*> Edge;
   DenseSet<Edge> KnownFeasibleEdges;
 public:
-  SCCPSolver(const DataLayout *DL, const TargetLibraryInfo *tli)
-      : DL(DL), TLI(tli) {}
+  SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+      : DL(DL), TLI(tli) {}
 
   /// MarkBlockExecutable - This method can be used by clients to mark all of
   /// the blocks that are known to be intrinsically live in the processed unit.
@@ -1012,7 +1012,8 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
   Constant *Ptr = Operands[0];
   auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
-  markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices));
+  markConstant(&I, ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr,
+                                                  Indices));
 }
 
 void SCCPSolver::visitStoreInst(StoreInst &SI) {
@@ -1504,7 +1505,7 @@ namespace {
 ///
 struct SCCP : public FunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfo>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
   static char ID; // Pass identification, replacement for typeid
   SCCP() : FunctionPass(ID) {
@@ -1561,9 +1562,9 @@ bool SCCP::runOnFunction(Function &F) {
     return false;
 
   DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
-  const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   SCCPSolver Solver(DL, TLI);
 
   // Mark the first block of the function as being executable.
@@ -1637,7 +1638,7 @@ namespace {
 ///
 struct IPSCCP : public ModulePass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfo>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
   static char ID;
   IPSCCP() : ModulePass(ID) {
@@ -1651,7 +1652,7 @@ char IPSCCP::ID = 0;
 INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
                 "Interprocedural Sparse Conditional Constant Propagation",
                 false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(IPSCCP, "ipsccp",
                 "Interprocedural Sparse Conditional Constant Propagation",
                 false, false)
@@ -1690,9 +1691,9 @@ static bool AddressIsTaken(const GlobalValue *GV) {
 }
 
 bool IPSCCP::runOnModule(Module &M) {
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const DataLayout &DL = M.getDataLayout();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   SCCPSolver Solver(DL, TLI);
 
   // AddressTakenFunctions - This set keeps track of the address-taken functions
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index ed161fd..056dd11 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -247,7 +247,7 @@ public:
   /// hold.
   void insert(ArrayRef<Slice> NewSlices) {
     int OldSize = Slices.size();
-    std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+    Slices.append(NewSlices.begin(), NewSlices.end());
     auto SliceI = Slices.begin() + OldSize;
     std::sort(SliceI, Slices.end());
     std::inplace_merge(Slices.begin(), SliceI, Slices.end());
@@ -701,6 +701,7 @@ private:
     // by writing out the code here where we have the underlying allocation
     // size readily available.
     APInt GEPOffset = Offset;
+    const DataLayout &DL = GEPI.getModule()->getDataLayout();
     for (gep_type_iterator GTI = gep_type_begin(GEPI),
                            GTE = gep_type_end(GEPI);
          GTI != GTE; ++GTI) {
@@ -750,6 +751,7 @@ private:
     if (!IsOffsetKnown)
       return PI.setAborted(&LI);
 
+    const DataLayout &DL = LI.getModule()->getDataLayout();
     uint64_t Size = DL.getTypeStoreSize(LI.getType());
     return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
   }
@@ -761,6 +763,7 @@ private:
     if (!IsOffsetKnown)
      return PI.setAborted(&SI);
 
+    const DataLayout &DL = SI.getModule()->getDataLayout();
    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
 
     // If this memory access can be shown to *statically* extend outside the
@@ -898,6 +901,7 @@ private:
     SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
     Visited.insert(Root);
     Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+    const DataLayout &DL = Root->getModule()->getDataLayout();
     // If there are no loads or stores, the access is dead. We mark that as
     // a size zero access.
     Size = 0;
@@ -1084,7 +1088,8 @@ class AllocaPromoter : public LoadAndStorePromoter {
   SmallVector<DbgValueInst *, 4> DVIs;
 
 public:
-  AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S,
+  AllocaPromoter(ArrayRef<const Instruction *> Insts,
+                 SSAUpdater &S,
                  AllocaInst &AI, DIBuilder &DIB)
       : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
@@ -1092,8 +1097,8 @@ public:
     // Retain the debug information attached to the alloca for use when
     // rewriting loads and stores.
     if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
-      if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
-        for (User *U : DebugNode->users())
+      if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
+        for (User *U : DINode->users())
           if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
             DDIs.push_back(DDI);
           else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
@@ -1162,10 +1167,9 @@ public:
       } else {
         continue;
       }
-      Instruction *DbgVal =
-          DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
-                                      DIExpression(DVI->getExpression()), Inst);
-      DbgVal->setDebugLoc(DVI->getDebugLoc());
+      DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(),
+                                  DVI->getExpression(), DVI->getDebugLoc(),
+                                  Inst);
     }
   }
 };
@@ -1194,7 +1198,6 @@ class SROA : public FunctionPass {
   const bool RequiresDomTree;
 
   LLVMContext *C;
-  const DataLayout *DL;
   DominatorTree *DT;
   AssumptionCache *AC;
@@ -1243,7 +1246,7 @@ class SROA : public FunctionPass {
 public:
   SROA(bool RequiresDomTree = true)
       : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
-        DL(nullptr), DT(nullptr) {
+        DT(nullptr) {
     initializeSROAPass(*PassRegistry::getPassRegistry());
   }
   bool runOnFunction(Function &F) override;
@@ -1257,8 +1260,8 @@ private:
   friend class AllocaSliceRewriter;
 
   bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
-  bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
-                        AllocaSlices::Partition &P);
+  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                               AllocaSlices::Partition &P);
   bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
   bool runOnAlloca(AllocaInst &AI);
   void clobberUse(Use &U);
@@ -1349,7 +1352,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
 ///
 /// FIXME: This should be hoisted into a generic utility, likely in
 /// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
+static bool isSafePHIToSpeculate(PHINode &PN) {
   // For now, we can only do this promotion if the load is in the same block
   // as the PHI, and if there are no stores between the phi and load.
   // TODO: Allow recursive phi users.
@@ -1381,6 +1384,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
   if (!HaveLoad)
     return false;
 
+  const DataLayout &DL = PN.getModule()->getDataLayout();
+
   // We can only transform this if it is safe to push the loads into the
   // predecessor blocks. The only thing to watch out for is that we can't put
   // a possibly trapping load in the predecessor if it is a critical edge.
@@ -1402,8 +1407,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
     // If this pointer is always safe to load, or if we can prove that there
     // is already a load in the block, then we can move the load to the pred
     // block.
-    if (InVal->isDereferenceablePointer(DL) ||
-        isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
+    if (isDereferenceablePointer(InVal, DL) ||
+        isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
       continue;
 
     return false;
@@ -1468,12 +1473,12 @@ static void speculatePHINodeLoads(PHINode &PN) {
 ///
 /// We can do this to a select if its only uses are loads and if the operand
 /// to the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst &SI,
-                                    const DataLayout *DL = nullptr) {
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
   Value *TValue = SI.getTrueValue();
   Value *FValue = SI.getFalseValue();
-  bool TDerefable = TValue->isDereferenceablePointer(DL);
-  bool FDerefable = FValue->isDereferenceablePointer(DL);
+  const DataLayout &DL = SI.getModule()->getDataLayout();
+  bool TDerefable = isDereferenceablePointer(TValue, DL);
+  bool FDerefable = isDereferenceablePointer(FValue, DL);
 
   for (User *U : SI.users()) {
     LoadInst *LI = dyn_cast<LoadInst>(U);
@@ -1484,10 +1489,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI,
     // absolutely (e.g. allocas) or at this point because we can see other
     // accesses to it.
     if (!TDerefable &&
-        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL))
+        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment()))
       return false;
     if (!FDerefable &&
-        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL))
+        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment()))
       return false;
   }
 
@@ -1547,7 +1552,8 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
   if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
     return BasePtr;
 
-  return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx");
+  return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
+                               NamePrefix + "sroa_idx");
 }
 
 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1798,7 +1804,8 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
     OffsetPtr = Int8PtrOffset == 0
                     ? Int8Ptr
-                    : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+                    : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+                                            IRB.getInt(Int8PtrOffset),
                                             NamePrefix + "sroa_raw_idx");
   }
   Ptr = OffsetPtr;
@@ -3245,7 +3252,8 @@ private:
   void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
     assert(Ty->isSingleValueType());
     // Load the single value and insert it using the indices.
-    Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
+    Value *GEP =
+        IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
     Value *Load = IRB.CreateLoad(GEP, Name + ".load");
     Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
     DEBUG(dbgs() << "          to: " << *Load << "\n");
@@ -3278,7 +3286,7 @@ private:
     // Extract the single value and store it using the indices.
     Value *Store = IRB.CreateStore(
         IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
-        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+        IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"));
     (void)Store;
     DEBUG(dbgs() << "          to: " << *Store << "\n");
   }
@@ -3699,6 +3707,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   // them to the alloca slices.
   SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
   std::vector<LoadInst *> SplitLoads;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
   for (LoadInst *LI : Loads) {
     SplitLoads.clear();
@@ -3724,10 +3733,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
       auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
       auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
       LoadInst *PLoad = IRB.CreateAlignedLoad(
-          getAdjustedPtr(IRB, *DL, BasePtr,
-                         APInt(DL->getPointerSizeInBits(), PartOffset),
+          getAdjustedPtr(IRB, DL, BasePtr,
+                         APInt(DL.getPointerSizeInBits(), PartOffset),
                          PartPtrTy, BasePtr->getName() + "."),
-          getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+          getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
           LI->getName());
 
       // Append this load onto the list of split loads so we can find it later
@@ -3777,10 +3786,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
           PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
 
       StoreInst *PStore = IRB.CreateAlignedStore(
-          PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
-                                APInt(DL->getPointerSizeInBits(), PartOffset),
+          PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+                                APInt(DL.getPointerSizeInBits(), PartOffset),
                                 PartPtrTy, StoreBasePtr->getName() + "."),
-          getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+          getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
       (void)PStore;
       DEBUG(dbgs() << "    +" << PartOffset << ":" << *PStore << "\n");
     }
@@ -3857,20 +3866,20 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
     } else {
       IRB.SetInsertPoint(BasicBlock::iterator(LI));
       PLoad = IRB.CreateAlignedLoad(
-          getAdjustedPtr(IRB, *DL, LoadBasePtr,
-                         APInt(DL->getPointerSizeInBits(), PartOffset),
+          getAdjustedPtr(IRB, DL, LoadBasePtr,
+                         APInt(DL.getPointerSizeInBits(), PartOffset),
                          PartPtrTy, LoadBasePtr->getName() + "."),
-          getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+          getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
           LI->getName());
     }
 
     // And store this partition.
     IRB.SetInsertPoint(BasicBlock::iterator(SI));
     StoreInst *PStore = IRB.CreateAlignedStore(
-        PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
-                              APInt(DL->getPointerSizeInBits(), PartOffset),
+        PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+                              APInt(DL.getPointerSizeInBits(), PartOffset),
                               PartPtrTy, StoreBasePtr->getName() + "."),
-        getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+        getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
 
     // Now build a new slice for the alloca.
     NewSlices.push_back(
@@ -3964,31 +3973,32 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
 /// appropriate new offsets. It also evaluates how successful the rewrite was
 /// at enabling promotion and if it was successful queues the alloca to be
 /// promoted.
-bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
-                            AllocaSlices::Partition &P) {
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                                   AllocaSlices::Partition &P) {
   // Try to compute a friendly type for this partition of the alloca. This
   // won't always succeed, in which case we fall back to a legal integer type
   // or an i8 array of an appropriate size.
   Type *SliceTy = nullptr;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
   if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
-    if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
+    if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
       SliceTy = CommonUseTy;
   if (!SliceTy)
-    if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                  P.beginOffset(), P.size()))
       SliceTy = TypePartitionTy;
   if ((!SliceTy || (SliceTy->isArrayTy() &&
                     SliceTy->getArrayElementType()->isIntegerTy())) &&
-      DL->isLegalInteger(P.size() * 8))
+      DL.isLegalInteger(P.size() * 8))
     SliceTy = Type::getIntNTy(*C, P.size() * 8);
   if (!SliceTy)
     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
-  assert(DL->getTypeAllocSize(SliceTy) >= P.size());
+  assert(DL.getTypeAllocSize(SliceTy) >= P.size());
 
-  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);
+  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
 
   VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
   if (VecTy)
     SliceTy = VecTy;
@@ -4003,18 +4013,19 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     NewAI = &AI;
     // FIXME: We should be able to bail at this point with "nothing changed".
     // FIXME: We might want to defer PHI speculation until after here.
+    // FIXME: return nullptr;
   } else {
     unsigned Alignment = AI.getAlignment();
     if (!Alignment) {
       // The minimum alignment which users can rely on when the explicit
       // alignment is omitted or zero is that required by the ABI for this
       // type.
-      Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
+      Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
     }
     Alignment = MinAlign(Alignment, P.beginOffset());
     // If we will get at least this much alignment from the type alone, leave
     // the alloca's alignment unconstrained.
-    if (Alignment <= DL->getABITypeAlignment(SliceTy))
+    if (Alignment <= DL.getABITypeAlignment(SliceTy))
       Alignment = 0;
     NewAI = new AllocaInst(
         SliceTy, nullptr, Alignment,
@@ -4034,7 +4045,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   SmallPtrSet<PHINode *, 8> PHIUsers;
   SmallPtrSet<SelectInst *, 8> SelectUsers;
 
-  AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
+  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
                                P.endOffset(), IsIntegerPromotable, VecTy,
                                PHIUsers, SelectUsers);
   bool Promotable = true;
@@ -4056,7 +4067,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
                                             E = PHIUsers.end();
        I != E; ++I)
-    if (!isSafePHIToSpeculate(**I, DL)) {
+    if (!isSafePHIToSpeculate(**I)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
@@ -4065,7 +4076,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
                                                E = SelectUsers.end();
        I != E; ++I)
-    if (!isSafeSelectToSpeculate(**I, DL)) {
+    if (!isSafeSelectToSpeculate(**I)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
@@ -4098,7 +4109,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     PostPromotionWorklist.pop_back();
   }
 
-  return true;
+  return NewAI;
 }
 
 /// \brief Walks the slices of an alloca and form partitions based on them,
@@ -4109,6 +4120,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   unsigned NumPartitions = 0;
   bool Changed = false;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
 
   // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);
@@ -4126,7 +4138,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
     // confident that the above handling of splittable loads and stores is
     // completely sufficient before we forcibly disable the remaining handling.
     if (S.beginOffset() == 0 &&
-        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
+        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
       continue;
     if (isa<LoadInst>(S.getUse()->getUser()) ||
         isa<StoreInst>(S.getUse()->getUser())) {
@@ -4137,9 +4149,29 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   if (!IsSorted)
     std::sort(AS.begin(), AS.end());
 
+  /// \brief Describes the allocas introduced by rewritePartition
+  /// in order to migrate the debug info.
+  struct Piece {
+    AllocaInst *Alloca;
+    uint64_t Offset;
+    uint64_t Size;
+    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+        : Alloca(AI), Offset(O), Size(S) {}
+  };
+  SmallVector<Piece, 4> Pieces;
+
   // Rewrite each partition.
   for (auto &P : AS.partitions()) {
-    Changed |= rewritePartition(AI, AS, P);
+    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+      Changed = true;
+      if (NewAI != &AI) {
+        uint64_t SizeOfByte = 8;
+        uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+        // Don't include any padding.
+        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+      }
+    }
     ++NumPartitions;
   }
@@ -4147,6 +4179,42 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   MaxPartitionsPerAlloca =
       std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
 
+  // Migrate debug information from the old alloca to the new alloca(s)
+  // and the individual partitions.
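The offset bookkeeping in the block below is easiest to see with numbers (hypothetical values; sizes and offsets are in bits):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Suppose the old alloca was itself a bit piece at offset 64, size 128 of
// some variable, and SROA now carves out a partition at offset 32, size 64
// within it. The new piece starts at 64 + 32 = 96 and must not run past the
// end of the enclosing piece (64 + 128 = 192).
int main() {
  uint64_t ExprOffset = 64, ExprSize = 128;  // enclosing bit piece
  uint64_t PieceOffset = 32, PieceSize = 64; // new partition within it
  uint64_t Start = ExprOffset + PieceOffset;
  uint64_t AbsEnd = ExprOffset + ExprSize;
  assert(Start < AbsEnd && "a piece past the end would describe only padding");
  uint64_t Size = std::min(PieceSize, AbsEnd - Start);
  assert(Start == 96 && Size == 64);
  return 0;
}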
+ if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { + auto *Var = DbgDecl->getVariable(); + auto *Expr = DbgDecl->getExpression(); + DIBuilder DIB(*AI.getParent()->getParent()->getParent(), + /*AllowUnresolved*/ false); + bool IsSplit = Pieces.size() > 1; + for (auto Piece : Pieces) { + // Create a piece expression describing the new partition or reuse AI's + // expression if there is only one partition. + auto *PieceExpr = Expr; + if (IsSplit || Expr->isBitPiece()) { + // If this alloca is already a scalar replacement of a larger aggregate, + // Piece.Offset describes the offset inside the scalar. + uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0; + uint64_t Start = Offset + Piece.Offset; + uint64_t Size = Piece.Size; + if (Expr->isBitPiece()) { + uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize(); + if (Start >= AbsEnd) + // No need to describe a SROAed padding. + continue; + Size = std::min(Size, AbsEnd - Start); + } + PieceExpr = DIB.createBitPieceExpression(Start, Size); + } + + // Remove any existing dbg.declare intrinsic describing the same alloca. + if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca)) + OldDDI->eraseFromParent(); + + DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(), + &AI); + } + } return Changed; } @@ -4179,21 +4247,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { AI.eraseFromParent(); return true; } + const DataLayout &DL = AI.getModule()->getDataLayout(); // Skip alloca forms that this analysis can't handle. if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - DL->getTypeAllocSize(AI.getAllocatedType()) == 0) + DL.getTypeAllocSize(AI.getAllocatedType()) == 0) return false; bool Changed = false; // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(*DL); + AggLoadStoreRewriter AggRewriter(DL); Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. - AllocaSlices AS(*DL, AI); + AllocaSlices AS(DL, AI); DEBUG(AS.print(dbgs())); if (AS.isEscaped()) return Changed; @@ -4258,8 +4327,11 @@ void SROA::deleteDeadInstructions( DeadInsts.insert(U); } - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { DeletedAllocas.insert(AI); + if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI)) + DbgDecl->eraseFromParent(); + } ++NumDeleted; I->eraseFromParent(); @@ -4363,12 +4435,6 @@ bool SROA::runOnFunction(Function &F) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (!DLP) { - DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); - return false; - } - DL = &DLP->getDataLayout(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? 
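The clamping arithmetic in the debug-info migration above is easy to miss in diff form; here it is isolated (names ours, logic mirrors the hunk):

    #include <algorithm>
    #include <cstdint>

    // The enclosing variable already occupies bits [Off, Off+Sz); a new
    // partition covers [Off+PieceOff, ...). Returns false when the partition
    // falls entirely in padding, so no dbg.declare is emitted for it.
    static bool clampBitPiece(uint64_t Off, uint64_t Sz, uint64_t PieceOff,
                              uint64_t PieceSz, uint64_t &Start,
                              uint64_t &Size) {
      Start = Off + PieceOff;
      uint64_t AbsEnd = Off + Sz;
      if (Start >= AbsEnd)
        return false; // SROAed padding, nothing to describe
      Size = std::min(PieceSz, AbsEnd - Start);
      return true;
    }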
&DTWP->getDomTree() : nullptr; @@ -4376,9 +4442,10 @@ bool SROA::runOnFunction(Function &F) { BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) + I != E; ++I) { if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) Worklist.insert(AI); + } bool Changed = false; // A set of deleted alloca instruction pointers which should be removed from diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp index 179bbf7..3480cd4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp @@ -95,7 +95,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTree>(); } @@ -217,13 +217,16 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { /// \returns The profiled weight of I. unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { DebugLoc DLoc = Inst.getDebugLoc(); + if (!DLoc) + return 0; + unsigned Lineno = DLoc.getLine(); if (Lineno < HeaderLineno) return 0; - DILocation DIL(DLoc.getAsMDNode(*Ctx)); + const DILocation *DIL = DLoc; int LOffset = Lineno - HeaderLineno; - unsigned Discriminator = DIL.getDiscriminator(); + unsigned Discriminator = DIL->getDiscriminator(); unsigned Weight = Samples->samplesAt(LOffset, Discriminator); DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst << " (line offset: " << LOffset << "." << Discriminator @@ -577,6 +580,10 @@ void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; unsigned i = 0; + // Add an entry count to the function using the samples gathered + // at the function entry. + F.setEntryCount(Samples->getHeadSamples()); + // Before propagation starts, build, for each block, a list of // unique predecessors and successors. This is necessary to handle // identical edges in multiway branches. Since we visit all blocks and all @@ -639,9 +646,8 @@ void SampleProfileLoader::propagateWeights(Function &F) { /// \returns the line number where \p F is defined. If it returns 0, /// it means that there is no debug information available for \p F. unsigned SampleProfileLoader::getFunctionLoc(Function &F) { - DISubprogram S = getDISubprogram(&F); - if (S.isSubprogram()) - return S.getLineNumber(); + if (DISubprogram *S = getDISubprogram(&F)) + return S->getLine(); // If could not find the start of \p F, emit a diagnostic to inform the user // about the missed opportunity. 
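The reworked getInstWeight() above keys each sample by the line offset from the function header plus the DWARF discriminator. A hedged sketch of that lookup, with a template parameter standing in for the reader's per-function sample record (only samplesAt() is taken from the hunk; the scaffolding is ours):

    #include "llvm/IR/DebugInfoMetadata.h"
    #include "llvm/IR/DebugLoc.h"

    // SamplesT stands in for the profile reader's sample record, which
    // exposes samplesAt(LineOffset, Discriminator) as used in the hunk.
    template <typename SamplesT>
    static unsigned weightFor(const llvm::DebugLoc &DLoc, unsigned HeaderLineno,
                              const SamplesT *Samples) {
      if (!DLoc)
        return 0; // instructions without locations carry no samples
      unsigned Lineno = DLoc.getLine();
      if (Lineno < HeaderLineno)
        return 0;
      const llvm::DILocation *DIL = DLoc; // DebugLoc now unwraps to DILocation*
      int LOffset = Lineno - HeaderLineno;
      return Samples->samplesAt(LOffset, DIL->getDiscriminator());
    }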
@@ -731,7 +737,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) @@ -762,7 +768,7 @@ bool SampleProfileLoader::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PDT = &getAnalysis<PostDominatorTree>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); Ctx = &F.getParent()->getContext(); Samples = Reader->getSamplesFor(F); if (!Samples->empty()) diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index a16e9e2..d5d3605 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -20,7 +20,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm; @@ -28,6 +28,7 @@ using namespace llvm; /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); + initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); @@ -38,13 +39,16 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeScalarizerPass(Registry); initializeDSEPass(Registry); initializeGVNPass(Registry); - initializeEarlyCSEPass(Registry); + initializeEarlyCSELegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); + initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyPass(Registry); initializeJumpThreadingPass(Registry); initializeLICMPass(Registry); initializeLoopDeletionPass(Registry); + initializeLoopAccessAnalysisPass(Registry); initializeLoopInstSimplifyPass(Registry); + initializeLoopInterchangePass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); @@ -55,9 +59,11 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); initializeMergedLoadStoreMotionPass(Registry); + initializeNaryReassociatePass(Registry); initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); + initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); initializeSROAPass(Registry); @@ -68,7 +74,13 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); + initializeSpeculativeExecutionPass(Registry); + initializeStraightLineStrengthReducePass(Registry); initializeLoadCombinePass(Registry); + initializePlaceBackedgeSafepointsImplPass(Registry); + initializePlaceSafepointsPass(Registry); + initializeFloat2IntPass(Registry); + initializeLoopDistributePass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -79,6 +91,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void 
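The LoopInfo -> LoopInfoWrapperPass renames above follow the same wrapper-pass idiom everywhere: request the wrapper, then unwrap the analysis. An illustrative skeleton (not from the commit) of that pattern:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Pass.h"
    #include <iterator>
    using namespace llvm;

    namespace {
    // Request LoopInfoWrapperPass in getAnalysisUsage, unwrap it at run time.
    struct LoopCount : FunctionPass {
      static char ID;
      LoopCount() : FunctionPass(ID) {}
      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.setPreservesAll();
        AU.addRequired<LoopInfoWrapperPass>();
      }
      bool runOnFunction(Function &F) override {
        LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
        (void)std::distance(LI.begin(), LI.end()); // top-level loop count
        return false; // analysis only, IR unchanged
      }
    };
    }
    char LoopCount::ID = 0;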
LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createBitTrackingDCEPass()); +} + void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAlignmentFromAssumptionsPass()); } @@ -198,7 +214,6 @@ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { void LLVMAddVerifierPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createVerifierPass()); - // FIXME: should this also add createDebugInfoVerifierPass()? } void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 5c49a55..d955da7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -89,7 +89,6 @@ namespace { private: bool HasDomTree; - const DataLayout *DL; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -159,9 +158,10 @@ namespace { void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess); - bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size); - uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, - Type *&IdxTy); + bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, + const DataLayout &DL); + uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, + const DataLayout &DL); void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); @@ -699,9 +699,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &DL, 0)); + AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0)); - if (GetUnderlyingObject(MTI->getSource(), &DL, 0) != OrigAI) { + if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) { // Dest must be OrigAI, change this to be a load from the original // pointer (bitcasted), then a store to our new alloca. assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); @@ -717,7 +717,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); SrcVal->setAlignment(MTI->getAlignment()); Builder.CreateStore(SrcVal, NewAI); - } else if (GetUnderlyingObject(MTI->getDest(), &DL, 0) != OrigAI) { + } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) { // Src must be OrigAI, change this to be a load from NewAI then a store // through the original dest pointer (bitcasted). assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); @@ -1032,17 +1032,8 @@ bool SROA::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - bool Changed = performPromotion(F); - // FIXME: ScalarRepl currently depends on DataLayout more than it - // theoretically needs to. It should be refactored in order to support - // target-independent IR. Until this is done, just skip the actual - // scalar-replacement portion of this pass. 
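For consumers of the C API, the new LLVMAddBitTrackingDCEPass binding above slots into the usual pass-manager flow. A minimal, hypothetical driver (only the binding itself is from this commit):

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Scalar.h"

    // Schedule the new bit-tracking DCE pass on a C-API pass manager.
    static void runBDCE(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddBitTrackingDCEPass(PM); // binding added in this commit
      LLVMRunPassManager(PM, M);
      LLVMDisposePassManager(PM);
    }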
- if (!DL) return Changed; - while (1) { bool LocalChange = performScalarRepl(F); if (!LocalChange) break; // No need to repromote if no scalarrepl @@ -1061,7 +1052,7 @@ class AllocaPromoter : public LoadAndStorePromoter { SmallVector<DbgDeclareInst *, 4> DDIs; SmallVector<DbgValueInst *, 4> DVIs; public: - AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaPromoter(ArrayRef<Instruction*> Insts, SSAUpdater &S, DIBuilder *DB) : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} @@ -1069,8 +1060,8 @@ public: // Remember which alloca we're promoting (for isInstInList). this->AI = AI; if (auto *L = LocalAsMetadata::getIfExists(AI)) { - if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) { - for (User *U : DebugNode->users()) + if (auto *DINode = MetadataAsValue::getIfExists(AI->getContext(), L)) { + for (User *U : DINode->users()) if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) DDIs.push_back(DDI); else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) @@ -1126,10 +1117,9 @@ public: } else { continue; } - Instruction *DbgVal = DIB->insertDbgValueIntrinsic( - Arg, 0, DIVariable(DVI->getVariable()), - DIExpression(DVI->getExpression()), Inst); - DbgVal->setDebugLoc(DVI->getDebugLoc()); + DIB->insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), + DVI->getExpression(), DVI->getDebugLoc(), + Inst); } } }; @@ -1148,9 +1138,10 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { - bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL); - bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL); +static bool isSafeSelectToSpeculate(SelectInst *SI) { + const DataLayout &DL = SI->getModule()->getDataLayout(); + bool TDerefable = isDereferenceablePointer(SI->getTrueValue(), DL); + bool FDerefable = isDereferenceablePointer(SI->getFalseValue(), DL); for (User *U : SI->users()) { LoadInst *LI = dyn_cast<LoadInst>(U); @@ -1158,11 +1149,13 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. - if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, - LI->getAlignment(), DL)) + if (!TDerefable && + !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, + LI->getAlignment())) return false; - if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, - LI->getAlignment(), DL)) + if (!FDerefable && + !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, + LI->getAlignment())) return false; } @@ -1185,7 +1178,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { +static bool isSafePHIToSpeculate(PHINode *PN) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. 
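The rewritten isSafeSelectToSpeculate() above is worth restating compactly: every user must be a simple load, and each select arm must be dereferenceable or provably safe to load at that point. A condensed restatement (function name ours, helpers and headers as of this era of the tree):

    #include "llvm/Analysis/Loads.h"
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    static bool selectIsSpeculatable(SelectInst *SI) {
      const DataLayout &DL = SI->getModule()->getDataLayout();
      bool TDeref = isDereferenceablePointer(SI->getTrueValue(), DL);
      bool FDeref = isDereferenceablePointer(SI->getFalseValue(), DL);
      for (User *U : SI->users()) {
        LoadInst *LI = dyn_cast<LoadInst>(U);
        if (!LI || !LI->isSimple())
          return false; // only simple loads may be rewritten
        if (!TDeref && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
                                                    LI->getAlignment()))
          return false;
        if (!FDeref && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
                                                    LI->getAlignment()))
          return false;
      }
      return true;
    }

Note that isSafeToLoadUnconditionally() no longer takes a DataLayout argument; that is the API change the hunk is tracking.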
@@ -1209,6 +1202,8 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { MaxAlign = std::max(MaxAlign, LI->getAlignment()); } + const DataLayout &DL = PN->getModule()->getDataLayout(); + // Okay, we know that we have one or more loads in the same block as the PHI. // We can transform this if it is safe to push the loads into the predecessor // blocks. The only thing to watch out for is that we can't put a possibly @@ -1233,8 +1228,8 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. - if (InVal->isDereferenceablePointer(DL) || - isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL)) + if (isDereferenceablePointer(InVal, DL) || + isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign)) continue; return false; @@ -1248,7 +1243,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. -static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; for (User *U : AI->users()) { @@ -1279,7 +1274,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { // If it is safe to turn "load (select c, AI, ptr)" into a select of two // loads, then we can transform this by rewriting the select. - if (!isSafeSelectToSpeculate(SI, DL)) + if (!isSafeSelectToSpeculate(SI)) return false; InstsToRewrite.insert(SI); @@ -1294,7 +1289,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. - if (!isSafePHIToSpeculate(PN, DL)) + if (!isSafePHIToSpeculate(PN)) return false; InstsToRewrite.insert(PN); @@ -1416,6 +1411,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { bool SROA::performPromotion(Function &F) { std::vector<AllocaInst*> Allocas; + const DataLayout &DL = F.getParent()->getDataLayout(); DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -1479,6 +1475,7 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { // bool SROA::performScalarRepl(Function &F) { std::vector<AllocaInst*> WorkList; + const DataLayout &DL = F.getParent()->getDataLayout(); // Scan the entry basic block, adding allocas to the worklist. BasicBlock &BB = F.getEntryBlock(); @@ -1508,7 +1505,7 @@ bool SROA::performScalarRepl(Function &F) { // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar // value cannot be decomposed at all. - uint64_t AllocaSize = DL->getTypeAllocSize(AI->getAllocatedType()); + uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; @@ -1531,8 +1528,9 @@ bool SROA::performScalarRepl(Function &F) { // promoted itself. If so, we don't want to transform it needlessly. 
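The per-predecessor condition inside isSafePHIToSpeculate() above reduces to a single disjunction; isolated here for clarity (name ours, same era helpers as the previous sketch):

    #include "llvm/Analysis/Loads.h"
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/BasicBlock.h"
    using namespace llvm;

    // A load of the PHI can be pushed into predecessor Pred iff the incoming
    // pointer is unconditionally dereferenceable, or a load of it is already
    // known safe just before Pred's terminator.
    static bool incomingValueLoadable(Value *InVal, BasicBlock *Pred,
                                      unsigned MaxAlign, const DataLayout &DL) {
      return isDereferenceablePointer(InVal, DL) ||
             isSafeToLoadUnconditionally(InVal, Pred->getTerminator(),
                                         MaxAlign);
    }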
Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. - if (AllocaInst *NewAI = ConvertToScalarInfo( - (unsigned)AllocaSize, *DL, ScalarLoadThreshold).TryConvert(AI)) { + if (AllocaInst *NewAI = + ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold) + .TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1610,6 +1608,7 @@ void SROA::DeleteDeadInstructions() { /// referenced by this instruction. void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { + const DataLayout &DL = I->getModule()->getDataLayout(); for (Use &U : I->uses()) { Instruction *User = cast<Instruction>(U.getUser()); @@ -1632,8 +1631,8 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), - LIType, false, Info, LI, true /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, + LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { @@ -1642,8 +1641,8 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), - SIType, true, Info, SI, true /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, + SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && @@ -1675,6 +1674,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (!Info.CheckedPHIs.insert(PN).second) return; + const DataLayout &DL = I->getModule()->getDataLayout(); for (User *U : I->users()) { Instruction *UI = cast<Instruction>(U); @@ -1691,8 +1691,8 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (!LI->isSimple()) return MarkUnsafe(Info, UI); Type *LIType = LI->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType), - LIType, false, Info, LI, false /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, + LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { @@ -1701,8 +1701,8 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, UI); Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType), - SIType, true, Info, SI, false /*AllowWholeAccess*/); + isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, + SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) { isSafePHISelectUseForScalarRepl(UI, Offset, Info); @@ -1746,9 +1746,11 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, // constant part of the offset. 
if (NonConstant) Indices.pop_back(); - Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); - if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, - NonConstantIdxSize)) + + const DataLayout &DL = GEPI->getModule()->getDataLayout(); + Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize, + DL)) MarkUnsafe(Info, GEPI); } @@ -1803,9 +1805,10 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess) { + const DataLayout &DL = TheAccess->getModule()->getDataLayout(); // Check if this is a load/store of the entire alloca. if (Offset == 0 && AllowWholeAccess && - MemSize == DL->getTypeAllocSize(Info.AI->getAllocatedType())) { + MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) { // This can be safe for MemIntrinsics (where MemOpType is 0) and integer // loads/stores (which are essentially the same as the MemIntrinsics with // regard to copying padding between elements). But, if an alloca is @@ -1828,7 +1831,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, } // Check if the offset/size correspond to a component within the alloca type. Type *T = Info.AI->getAllocatedType(); - if (TypeHasComponent(T, Offset, MemSize)) { + if (TypeHasComponent(T, Offset, MemSize, DL)) { Info.hasSubelementAccess = true; return; } @@ -1838,24 +1841,25 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, /// TypeHasComponent - Return true if T has a component type with the /// specified offset and size. If Size is zero, do not check the size. -bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { +bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, + const DataLayout &DL) { Type *EltTy; uint64_t EltSize; if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = DL->getStructLayout(ST); + const StructLayout *Layout = DL.getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); - EltSize = DL->getTypeAllocSize(EltTy); + EltSize = DL.getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { EltTy = AT->getElementType(); - EltSize = DL->getTypeAllocSize(EltTy); + EltSize = DL.getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; } else if (VectorType *VT = dyn_cast<VectorType>(T)) { EltTy = VT->getElementType(); - EltSize = DL->getTypeAllocSize(EltTy); + EltSize = DL.getTypeAllocSize(EltTy); if (Offset >= VT->getNumElements() * EltSize) return false; Offset %= EltSize; @@ -1867,7 +1871,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { // Check if the component spans multiple elements. if (Offset + Size > EltSize) return false; - return TypeHasComponent(EltTy, Offset, Size); + return TypeHasComponent(EltTy, Offset, Size, DL); } /// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite @@ -1876,6 +1880,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { /// instruction. 
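The recursion in TypeHasComponent() above bottoms out by repeatedly folding the offset into one element; a standalone analogue of its array case makes the "Offset %= EltSize" step concrete (name ours):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    // The access must land inside the array and must not straddle an element
    // boundary; the real routine then recurses into the element type.
    static bool arrayAccessFitsElement(ArrayType *AT, uint64_t Offset,
                                       uint64_t Size, const DataLayout &DL) {
      uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
      if (Offset >= AT->getNumElements() * EltSize)
        return false;      // past the end of the array
      Offset %= EltSize;   // position within one element
      return Offset + Size <= EltSize;
    }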
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl<AllocaInst *> &NewElts) { + const DataLayout &DL = I->getModule()->getDataLayout(); for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = *UI++; Instruction *User = cast<Instruction>(TheUse.getUser()); @@ -1893,8 +1898,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); - if (Offset == 0 && - MemSize == DL->getTypeAllocSize(AI->getAllocatedType())) + if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType())) RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. @@ -1930,8 +1934,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); } else if (LIType->isIntegerTy() && - DL->getTypeAllocSize(LIType) == - DL->getTypeAllocSize(AI->getAllocatedType())) { + DL.getTypeAllocSize(LIType) == + DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a load of the entire alloca to an integer, rewrite it. RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } @@ -1957,8 +1961,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && - DL->getTypeAllocSize(SIType) == - DL->getTypeAllocSize(AI->getAllocatedType())) { + DL.getTypeAllocSize(SIType) == + DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a store of the entire alloca from an integer, rewrite it. RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } @@ -2001,7 +2005,8 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, Type *T = AI->getAllocatedType(); uint64_t EltOffset = 0; Type *IdxTy; - uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, + BC->getModule()->getDataLayout()); Instruction *Val = NewElts[Idx]; if (Val->getType() != BC->getDestTy()) { Val = new BitCastInst(Val, BC->getDestTy(), "", BC); @@ -2016,11 +2021,12 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, /// Sets T to the type of the element and Offset to the offset within that /// element. IdxTy is set to the type of the index result to be used in a /// GEP instruction. 
-uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, - Type *&IdxTy) { +uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, + const DataLayout &DL) { uint64_t Idx = 0; + if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = DL->getStructLayout(ST); + const StructLayout *Layout = DL.getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); Offset -= Layout->getElementOffset(Idx); @@ -2028,7 +2034,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, return Idx; } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { T = AT->getElementType(); - uint64_t EltSize = DL->getTypeAllocSize(T); + uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2036,7 +2042,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, } VectorType *VT = cast<VectorType>(T); T = VT->getElementType(); - uint64_t EltSize = DL->getTypeAllocSize(T); + uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); @@ -2049,6 +2055,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVectorImpl<AllocaInst *> &NewElts) { uint64_t OldOffset = Offset; + const DataLayout &DL = GEPI->getModule()->getDataLayout(); SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. // In this case, it must be the last GEP operand which is dynamic so keep that @@ -2057,19 +2064,19 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); - Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); Type *T = AI->getAllocatedType(); Type *IdxTy; - uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy); + uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL); if (GEPI->getOperand(0) == AI) OldIdx = ~0ULL; // Force the GEP to be rewritten. T = AI->getAllocatedType(); uint64_t EltOffset = Offset; - uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL); // If this GEP does not move the pointer across elements of the alloca // being split, then it does not needs to be rewritten. @@ -2080,7 +2087,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<Value*, 8> NewArgs; NewArgs.push_back(Constant::getNullValue(i32Ty)); while (EltOffset != 0) { - uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy); + uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } if (NonConstantIdx) { @@ -2114,9 +2121,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. 
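A tiny worked instance of FindElementAndOffset()'s array case above, as plain arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      // T = [4 x i32] gives EltSize = 4; a byte offset of 10 resolves to
      // element index 2 with 2 residual bytes inside that element.
      uint64_t Offset = 10;
      const uint64_t EltSize = 4;
      uint64_t Idx = Offset / EltSize;
      Offset -= Idx * EltSize;
      assert(Idx == 2 && Offset == 2);
      return 0;
    }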
Type *AIType = AI->getAllocatedType(); + const DataLayout &DL = II->getModule()->getDataLayout(); uint64_t NewOffset = Offset; Type *IdxTy; - uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL); IRBuilder<> Builder(II); uint64_t Size = OldSize->getLimitedValue(); @@ -2126,10 +2134,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, // split the alloca again later. unsigned AS = AI->getType()->getAddressSpace(); Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); - V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); + V = Builder.CreateGEP(Builder.getInt8Ty(), V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = DL->getTypeAllocSize(IdxTy) - NewOffset; + uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset; if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2145,7 +2153,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, for (; Idx != NewElts.size() && Size; ++Idx) { IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = DL->getTypeAllocSize(IdxTy); + uint64_t EltSize = DL.getTypeAllocSize(IdxTy); if (EltSize > Size) { EltSize = Size; Size = 0; @@ -2221,6 +2229,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, bool SROADest = MI->getRawDest() == Inst; Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); + const DataLayout &DL = MI->getModule()->getDataLayout(); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. @@ -2237,10 +2246,10 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); Type *OtherTy = OtherPtrTy->getElementType(); if (StructType *ST = dyn_cast<StructType>(OtherTy)) { - EltOffset = DL->getStructLayout(ST)->getElementOffset(i); + EltOffset = DL.getStructLayout(ST)->getElementOffset(i); } else { Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); - EltOffset = DL->getTypeAllocSize(EltTy)*i; + EltOffset = DL.getTypeAllocSize(EltTy) * i; } // The alignment of the other pointer is the guaranteed alignment of the @@ -2281,7 +2290,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. - unsigned EltSize = DL->getTypeSizeInBits(ValTy); + unsigned EltSize = DL.getTypeSizeInBits(ValTy); APInt OneVal(EltSize, CI->getZExtValue()); APInt TotalVal(OneVal); // Set each byte. @@ -2311,7 +2320,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // this element. } - unsigned EltSize = DL->getTypeAllocSize(EltTy); + unsigned EltSize = DL.getTypeAllocSize(EltTy); if (!EltSize) continue; @@ -2345,12 +2354,13 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // and store the element value to the individual alloca. 
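The CreateGEP calls above gain an explicit pointee-type argument in this release; the byte-addressed form used for the lifetime markers looks like this (helper name ours):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Old form: B.CreateGEP(Ptr, B.getInt64(Off));
    // New form passes the pointee type (i8 here) explicitly.
    static Value *byteGEP(IRBuilder<> &B, Value *Ptr, uint64_t Off) {
      return B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt64(Off));
    }

This is groundwork for opaque pointers: the GEP no longer infers its element type from the pointer operand.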
Value *SrcVal = SI->getOperand(0); Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); + const DataLayout &DL = SI->getModule()->getDataLayout(); + uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); // Handle tail padding by extending the operand - if (DL->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) + if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, IntegerType::get(SI->getContext(), AllocaSizeBits)); @@ -2360,15 +2370,15 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - const StructLayout *Layout = DL->getStructLayout(EltSTy); + const StructLayout *Layout = DL.getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); - if (DL->isBigEndian()) - Shift = AllocaSizeBits-Shift-DL->getTypeAllocSizeInBits(FieldTy); + if (DL.isBigEndian()) + Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy); Value *EltVal = SrcVal; if (Shift) { @@ -2377,7 +2387,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } // Truncate down to an integer of the right size. - uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; @@ -2402,12 +2412,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } else { ArrayType *ATy = cast<ArrayType>(AllocaEltTy); Type *ArrayEltTy = ATy->getElementType(); - uint64_t ElementOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); - uint64_t ElementSizeBits = DL->getTypeSizeInBits(ArrayEltTy); + uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); + uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy); uint64_t Shift; - if (DL->isBigEndian()) + if (DL.isBigEndian()) Shift = AllocaSizeBits-ElementOffset; else Shift = 0; @@ -2441,7 +2451,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } new StoreInst(EltVal, DestField, SI); - if (DL->isBigEndian()) + if (DL.isBigEndian()) Shift -= ElementOffset; else Shift += ElementOffset; @@ -2459,7 +2469,8 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // Extract each element out of the NewElts according to its structure offset // and form the result value. 
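The endian handling in the store-splitting loop above is the subtle part: on big-endian targets a field's bits sit at the high end of the whole-alloca integer, so the extraction shift is measured from the top. Isolated (name ours):

    #include "llvm/IR/DataLayout.h"
    using namespace llvm;

    static uint64_t fieldShift(const DataLayout &DL, uint64_t AllocaSizeBits,
                               uint64_t FieldOffsetBits,
                               uint64_t FieldAllocSizeBits) {
      return DL.isBigEndian()
                 ? AllocaSizeBits - FieldOffsetBits - FieldAllocSizeBits
                 : FieldOffsetBits;
    }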
Type *AllocaEltTy = AI->getAllocatedType(); - uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy); + const DataLayout &DL = LI->getModule()->getDataLayout(); + uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); @@ -2469,10 +2480,10 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - Layout = DL->getStructLayout(EltSTy); + Layout = DL.getStructLayout(EltSTy); } else { Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); - ArrayEltBitOffset = DL->getTypeAllocSizeInBits(ArrayEltTy); + ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); } Value *ResultVal = @@ -2484,7 +2495,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, Value *SrcField = NewElts[i]; Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); - uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy); + uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; @@ -2515,7 +2526,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, else // Array case. Shift = i*ArrayEltBitOffset; - if (DL->isBigEndian()) + if (DL.isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); if (Shift) { @@ -2532,7 +2543,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, } // Handle tail padding by truncating the result - if (DL->getTypeSizeInBits(LI->getType()) != AllocaSizeBits) + if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits) ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); @@ -2589,13 +2600,15 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return false; } + const DataLayout &DL = AI->getModule()->getDataLayout(); + // Okay, we know all the users are promotable. If the aggregate is a memcpy // source and destination, we have to be careful. In particular, the memcpy // could be moving around elements that live in structure padding of the LLVM // types, but may actually be used. In these cases, we refuse to promote the // struct. 
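A scalar model of the load-reassembly loop above, assuming a little-endian layout: each field is shifted to its bit offset and OR'd into the running result, then tail padding is truncated away.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Result = 0;
      Result |= uint64_t{0x34} << 0; // field 0 at bit 0
      Result |= uint64_t{0x12} << 8; // field 1 at bit 8
      assert(Result == 0x1234);
      return 0;
    }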
if (Info.isMemCpySrc && Info.isMemCpyDst && - HasPadding(AI->getAllocatedType(), *DL)) + HasPadding(AI->getAllocatedType(), DL)) return false; // If the alloca never has an access to just *part* of it, but is accessed diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 6036c09..d55dc6a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -165,7 +165,7 @@ private: void gather(Instruction *, const ValueVector &); bool canTransferMetadata(unsigned Kind); void transferMetadata(Instruction *, const ValueVector &); - bool getVectorLayout(Type *, unsigned, VectorLayout &); + bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &); bool finish(); template<typename T> bool splitBinary(Instruction &, const T &); @@ -173,7 +173,6 @@ private: ScatterMap Scattered; GatherList Gathered; unsigned ParallelLoopAccessMDKind; - const DataLayout *DL; bool ScalarizeLoadStore; }; @@ -214,7 +213,7 @@ Value *Scatterer::operator[](unsigned I) { CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0"); } if (I != 0) - CV[I] = Builder.CreateConstGEP1_32(CV[0], I, + CV[I] = Builder.CreateConstGEP1_32(nullptr, CV[0], I, V->getName() + ".i" + Twine(I)); } else { // Search through a chain of InsertElementInsts looking for element I. @@ -248,8 +247,6 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { BasicBlock *BB = BBI; for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { @@ -345,10 +342,7 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { // Try to fill in Layout from Ty, returning true on success. Alignment is // the alignment of the vector, or 0 if the ABI default should be used. bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, - VectorLayout &Layout) { - if (!DL) - return false; - + VectorLayout &Layout, const DataLayout &DL) { // Make sure we're dealing with a vector. Layout.VecTy = dyn_cast<VectorType>(Ty); if (!Layout.VecTy) @@ -356,15 +350,15 @@ bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment, // Check that we're dealing with full-byte elements. 
Layout.ElemTy = Layout.VecTy->getElementType(); - if (DL->getTypeSizeInBits(Layout.ElemTy) != - DL->getTypeStoreSizeInBits(Layout.ElemTy)) + if (DL.getTypeSizeInBits(Layout.ElemTy) != + DL.getTypeStoreSizeInBits(Layout.ElemTy)) return false; if (Alignment) Layout.VecAlign = Alignment; else - Layout.VecAlign = DL->getABITypeAlignment(Layout.VecTy); - Layout.ElemSize = DL->getTypeStoreSize(Layout.ElemTy); + Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy); + Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy); return true; } @@ -456,7 +450,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { Indices.resize(NumIndices); for (unsigned J = 0; J < NumIndices; ++J) Indices[J] = Ops[J][I]; - Res[I] = Builder.CreateGEP(Base[I], Indices, + Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices, GEPI.getName() + ".i" + Twine(I)); if (GEPI.isInBounds()) if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I])) @@ -595,7 +589,8 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return false; VectorLayout Layout; - if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout)) + if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout, + LI.getModule()->getDataLayout())) return false; unsigned NumElems = Layout.VecTy->getNumElements(); @@ -619,7 +614,8 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { VectorLayout Layout; Value *FullValue = SI.getValueOperand(); - if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout)) + if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout, + SI.getModule()->getDataLayout())) return false; unsigned NumElems = Layout.VecTy->getNumElements(); diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6157746..3a782d1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -160,6 +160,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -167,6 +168,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" @@ -177,6 +179,13 @@ static cl::opt<bool> DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), cl::desc("Do not separate the constant offset from a GEP instruction"), cl::Hidden); +// Setting this flag may emit false positives when the input module already +// contains dead instructions. Therefore, we set it only in unit tests that are +// free of dead code. +static cl::opt<bool> + VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false), + cl::desc("Verify this pass produces no dead code"), + cl::Hidden); namespace { @@ -194,23 +203,26 @@ namespace { /// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case, /// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15). class ConstantOffsetExtractor { - public: +public: /// Extracts a constant offset from the given GEP index. 
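The full-byte test above compares a type's bit size with its store size; an i1 element (1 bit versus an 8-bit store size) fails it, so vectors like <N x i1> are left alone by the scalarizer's load/store path. Sketch (name ours):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    static bool hasFullByteElems(VectorType *VecTy, const DataLayout &DL) {
      Type *ElemTy = VecTy->getElementType();
      return DL.getTypeSizeInBits(ElemTy) ==
             DL.getTypeStoreSizeInBits(ElemTy);
    }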
It returns the /// new index representing the remainder (equal to the original index minus /// the constant offset), or nullptr if we cannot extract a constant offset. - /// \p Idx The given GEP index - /// \p DL The datalayout of the module - /// \p GEP The given GEP - static Value *Extract(Value *Idx, const DataLayout *DL, - GetElementPtrInst *GEP); + /// \p Idx The given GEP index + /// \p GEP The given GEP + /// \p UserChainTail Outputs the tail of UserChain so that we can + /// garbage-collect unused instructions in UserChain. + static Value *Extract(Value *Idx, GetElementPtrInst *GEP, + User *&UserChainTail, const DominatorTree *DT); /// Looks for a constant offset from the given GEP index without extracting /// it. It returns the numeric value of the extracted constant offset (0 if /// failed). The meaning of the arguments are the same as Extract. - static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP); + static int64_t Find(Value *Idx, GetElementPtrInst *GEP, + const DominatorTree *DT); - private: - ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt) - : DL(Layout), IP(InsertionPt) {} +private: + ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT) + : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) { + } /// Searches the expression that computes V for a non-zero constant C s.t. /// V can be reassociated into the form V' + C. If the searching is /// successful, returns C and update UserChain as a def-use chain from C to V; @@ -268,12 +280,6 @@ class ConstantOffsetExtractor { /// returns "sext i32 (zext i16 V to i32) to i64". Value *applyExts(Value *V); - /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. - bool NoCommonBits(Value *LHS, Value *RHS) const; - /// Computes which bits are known to be one or zero. - /// \p KnownOne Mask of all bits that are known to be one. - /// \p KnownZero Mask of all bits that are known to be zero. - void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; /// A helper function that returns whether we can trace into the operands /// of binary operator BO for a constant offset. /// @@ -294,39 +300,36 @@ class ConstantOffsetExtractor { /// A data structure used in rebuildWithoutConstOffset. Contains all /// sext/zext instructions along UserChain. SmallVector<CastInst *, 16> ExtInsts; - /// The data layout of the module. Used in ComputeKnownBits. - const DataLayout *DL; Instruction *IP; /// Insertion position of cloned instructions. + const DataLayout &DL; + const DominatorTree *DT; }; /// \brief A pass that tries to split every GEP in the function into a variadic /// base and a constant offset. It is a FunctionPass because searching for the /// constant offset may inspect other basic blocks. 
class SeparateConstOffsetFromGEP : public FunctionPass { - public: +public: static char ID; SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, bool LowerGEP = false) - : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { + : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) { initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DataLayoutPass>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); } bool doInitialization(Module &M) override { - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - if (DLP == nullptr) - report_fatal_error("data layout missing"); - DL = &DLP->getDataLayout(); + DL = &M.getDataLayout(); return false; } - bool runOnFunction(Function &F) override; - private: +private: /// Tries to split the given GEP into a variadic base and a constant offset, /// and returns true if the splitting succeeds. bool splitGEP(GetElementPtrInst *GEP); @@ -370,8 +373,11 @@ class SeparateConstOffsetFromGEP : public FunctionPass { /// /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// Verify F is free of dead code. + void verifyNoDeadCode(Function &F); const DataLayout *DL; + const DominatorTree *DT; const TargetMachine *TM; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. @@ -384,8 +390,8 @@ INITIALIZE_PASS_BEGIN( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_DEPENDENCY(DataLayoutPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -413,7 +419,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). - if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS)) + if (BO->getOpcode() == Instruction::Or && + !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT)) return false; // In addition, tracing into BO requires that its surrounding s/zext (if @@ -498,9 +505,8 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, ConstantOffset = CI->getValue(); } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) { // Trace into subexpressions for more hoisting opportunities. 
- if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { + if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); - } } else if (isa<SExtInst>(V)) { ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, ZeroExtended, NonNegative).sext(BitWidth); @@ -597,6 +603,11 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { } BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]); + assert(BO->getNumUses() <= 1 && + "distributeExtsAndCloneChain clones each BinaryOperator in " + "UserChain, so no one should be used more than " + "once"); + unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); Value *NextInChain = removeConstOffset(ChainIndex - 1); @@ -609,6 +620,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return TheOther; } + BinaryOperator::BinaryOps NewOp = BO->getOpcode(); if (BO->getOpcode() == Instruction::Or) { // Rebuild "or" as "add", because "or" may be invalid for the new // epxression. @@ -623,68 +635,46 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { // // Replacing the "or" with "add" is fine, because // a | (b + 5) = a + (b + 5) = (a + b) + 5 - if (OpNo == 0) { - return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(), - IP); - } else { - return BinaryOperator::CreateAdd(TheOther, NextInChain, BO->getName(), - IP); - } + NewOp = Instruction::Add; } - // We can reuse BO in this case, because the new expression shares the same - // instruction type and BO is used at most once. - assert(BO->getNumUses() <= 1 && - "distributeExtsAndCloneChain clones each BinaryOperator in " - "UserChain, so no one should be used more than " - "once"); - BO->setOperand(OpNo, NextInChain); - BO->setHasNoSignedWrap(false); - BO->setHasNoUnsignedWrap(false); - // Make sure it appears after all instructions we've inserted so far. - BO->moveBefore(IP); - return BO; + BinaryOperator *NewBO; + if (OpNo == 0) { + NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP); + } else { + NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP); + } + NewBO->takeName(BO); + return NewBO; } -Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL, - GetElementPtrInst *GEP) { - ConstantOffsetExtractor Extractor(DL, GEP); +Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, + User *&UserChainTail, + const DominatorTree *DT) { + ConstantOffsetExtractor Extractor(GEP, DT); // Find a non-zero constant offset first. APInt ConstantOffset = Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, GEP->isInBounds()); - if (ConstantOffset == 0) + if (ConstantOffset == 0) { + UserChainTail = nullptr; return nullptr; + } // Separates the constant offset from the GEP index. - return Extractor.rebuildWithoutConstOffset(); + Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset(); + UserChainTail = Extractor.UserChain.back(); + return IdxWithoutConstOffset; } -int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL, - GetElementPtrInst *GEP) { +int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP, + const DominatorTree *DT) { // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. 
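The "rebuild or as add" step above rests on a carry-free identity, now proven via haveNoCommonBitsSet() instead of the pass's hand-rolled known-bits code. The identity itself, as a self-checking snippet:

    #include <cassert>
    #include <cstdint>

    int main() {
      // (a | b) == (a + b) exactly when a and b share no set bits,
      // i.e. when the addition cannot carry.
      uint32_t A = 0xF0, B = 0x0F; // disjoint masks
      assert((A | B) == (A + B));
      uint32_t C = 0x18, D = 0x09; // both have bit 3 set
      assert((C | D) != (C + D));  // a carry breaks the identity
      return 0;
    }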
- return ConstantOffsetExtractor(DL, GEP) + return ConstantOffsetExtractor(GEP, DT) .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, GEP->isInBounds()) .getSExtValue(); } -void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, - APInt &KnownZero) const { - IntegerType *IT = cast<IntegerType>(V->getType()); - KnownOne = APInt(IT->getBitWidth(), 0); - KnownZero = APInt(IT->getBitWidth(), 0); - llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); -} - -bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { - assert(LHS->getType() == RHS->getType() && - "LHS and RHS should have the same type"); - APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; - ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); - ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); - return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); -} - bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( GetElementPtrInst *GEP) { bool Changed = false; @@ -713,7 +703,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, if (isa<SequentialType>(*GTI)) { // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP); + ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT); if (ConstantOffset != 0) { NeedsExtraction = true; // A GEP may have multiple indices. We accumulate the extracted @@ -770,14 +760,16 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( } } // Create an ugly GEP with a single index for each index. - ResultPtr = Builder.CreateGEP(ResultPtr, Idx, "uglygep"); + ResultPtr = + Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); } } // Create a GEP with the constant offset index. if (AccumulativeByteOffset != 0) { Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); - ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep"); + ResultPtr = + Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); } if (ResultPtr->getType() != Variadic->getType()) ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); @@ -857,7 +849,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // of variable indices. Therefore, we don't check for addressing modes in that // case. if (!LowerGEP) { - TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *GEP->getParent()->getParent()); if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), /*BaseGV=*/nullptr, AccumulativeByteOffset, /*HasBaseReg=*/true, /*Scale=*/0)) { @@ -877,10 +871,17 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { if (isa<SequentialType>(*GTI)) { // Splits this GEP index into a variadic part and a constant offset, and // uses the variadic part as the new index. + Value *OldIdx = GEP->getOperand(I); + User *UserChainTail; Value *NewIdx = - ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP); + ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT); if (NewIdx != nullptr) { + // Switches to the index with the constant offset removed. GEP->setOperand(I, NewIdx); + // After switching to the new index, we can garbage-collect UserChain + // and the old index if they are not used. 
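The shape of lowerToSingleIndexGEPs() above, condensed: each variadic index becomes an i8 "uglygep", and the accumulated constant offset is applied last. A sketch (name and parameter list ours):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    static Value *lowerUgly(IRBuilder<> &B, Value *Ptr,
                            ArrayRef<Value *> ByteIndices, int64_t ConstOff,
                            Type *IntPtrTy) {
      for (Value *Idx : ByteIndices)
        Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, Idx, "uglygep");
      if (ConstOff != 0)
        Ptr = B.CreateGEP(B.getInt8Ty(), Ptr,
                          ConstantInt::get(IntPtrTy, ConstOff), "uglygep");
      return Ptr;
    }

Keeping the constant in its own trailing GEP is what lets later CSE share the variadic prefix across accesses.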
+ RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+ RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
}
}
}
@@ -910,7 +911,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (LowerGEP) {
// As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
// arithmetic operations if the target uses alias analysis in codegen.
- if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())
+ if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
else
lowerToArithmetics(GEP, AccumulativeByteOffset);
@@ -962,8 +963,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
- NewGEP = GetElementPtrInst::Create(
- NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP);
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
} else {
// Unlikely but possible. For example,
// #pragma pack(1)
@@ -983,8 +985,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
GEP->getPointerAddressSpace());
NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
NewGEP = GetElementPtrInst::Create(
- NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true),
- "uglygep", GEP);
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
if (GEP->getType() != I8PtrTy)
NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
}
@@ -996,9 +999,14 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
}
bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
if (DisableSeparateConstOffsetFromGEP)
return false;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
bool Changed = false;
for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
@@ -1009,5 +1017,22 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
// already.
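To picture the address arithmetic that splitGEP performs above, here is a hedged standalone C++ sketch (a plain int array is assumed; this is not the pass's code). The likely case turns the extracted byte offset into a trailing index; the unlikely case falls back to raw byte arithmetic, the "uglygep":

#include <cassert>

int main() {
  int a[16] = {};
  long i = 3;
  int *orig = &a[i + 5];  // one GEP whose index hides a constant offset
  // Likely case: 5 * sizeof(int) is a multiple of the element size, so the
  // constant becomes a second, constant-index GEP on the variadic part.
  int *split = &a[i] + 5;
  // Unlikely case (e.g. packed structs): bump a char pointer by raw bytes.
  int *ugly = (int *)((char *)&a[i] + 5 * sizeof(int));
  assert(orig == split && split == ugly);
  return 0;
}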
}
}
+
+ if (VerifyNoDeadCode)
+ verifyNoDeadCode(F);
+
return Changed;
}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+ for (auto &B : F) {
+ for (auto &I : B) {
+ if (isInstructionTriviallyDead(&I)) {
+ std::string ErrMessage;
+ raw_string_ostream RSO(ErrMessage);
+ RSO << "Dead instruction detected!\n" << I << "\n";
+ llvm_unreachable(RSO.str().c_str());
+ }
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 2e317f9..8566cd9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,7 +21,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -37,6 +37,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "simplifycfg"
@@ -47,36 +48,6 @@ UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
STATISTIC(NumSimpl, "Number of blocks simplified");
-namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- unsigned BonusInstThreshold;
- CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
- BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfo>();
- }
-};
-}
-
-char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-
-// Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
- return new CFGSimplifyPass(Threshold);
-}
-
/// mergeEmptyReturnBlocks - If we have more than one return block that is
/// empty (apart from phi nodes), merge them together to promote recursive block merging.
static bool mergeEmptyReturnBlocks(Function &F) {
@@ -156,7 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
- const DataLayout *DL, AssumptionCache *AC,
+ AssumptionCache *AC,
unsigned BonusInstThreshold) {
bool Changed = false;
bool LocalChange = true;
@@ -166,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded...
// for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) { + if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { LocalChange = true; ++NumSimpl; } @@ -176,21 +147,11 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, return Changed; } -// It is possible that we may require multiple passes over the code to fully -// simplify the CFG. -// -bool CFGSimplifyPass::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - AssumptionCache *AC = - &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; +static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, + AssumptionCache *AC, int BonusInstThreshold) { bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold); + EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -204,9 +165,66 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold); + EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; } + +SimplifyCFGPass::SimplifyCFGPass() + : BonusInstThreshold(UserBonusInstThreshold) {} + +SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold) + : BonusInstThreshold(BonusInstThreshold) {} + +PreservedAnalyses SimplifyCFGPass::run(Function &F, + AnalysisManager<Function> *AM) { + auto &TTI = AM->getResult<TargetIRAnalysis>(F); + auto &AC = AM->getResult<AssumptionAnalysis>(F); + + if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold)) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); +} + +namespace { +struct CFGSimplifyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + unsigned BonusInstThreshold; + CFGSimplifyPass(int T = -1) : FunctionPass(ID) { + BonusInstThreshold = (T == -1) ? 
UserBonusInstThreshold : unsigned(T); + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + AssumptionCache *AC = + &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } +}; +} + +char CFGSimplifyPass::ID = 0; +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) + +// Public interface to the CFGSimplification pass +FunctionPass *llvm::createCFGSimplificationPass(int Threshold) { + return new CFGSimplifyPass(Threshold); +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index 903b675..b169d56 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -35,7 +36,6 @@ namespace { DominatorTree *DT; LoopInfo *LI; AliasAnalysis *AA; - const DataLayout *DL; public: static char ID; // Pass identification @@ -50,9 +50,9 @@ namespace { FunctionPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); } private: bool ProcessBlock(BasicBlock &BB); @@ -64,7 +64,7 @@ namespace { char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) @@ -98,10 +98,8 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); AA = &getAnalysis<AliasAnalysis>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; bool MadeChange, EverMadeChange = false; @@ -196,7 +194,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. 
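The comment above states the key safety condition in Sinking: a load must not be sunk across a critical edge when another path into the target block may store to the same location. A self-contained C++ sketch of how such a reordering would change the observed value (illustrative only, not the pass's code):

#include <cassert>

int main() {
  int p = 1;
  int v = p;       // the load, in its original position above the branch
  bool cond = true;
  if (cond)
    p = 0;         // a store on another path into the join block
  assert(v == 1);  // the original program saw the pre-store value
  int sunk = p;    // a load sunk below the join would observe 0 instead
  assert(sunk == 0);
  return 0;
}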
- if (!isSafeToSpeculativelyExecute(Inst, DL))
+ if (!isSafeToSpeculativelyExecute(Inst))
return false;
// We don't want to sink across a critical edge if we don't dominate the
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
new file mode 100644
index 0000000..ff3f00a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -0,0 +1,243 @@
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+// if (b > 0)
+// c = a[i + 1]
+// if (d > 0)
+// e = a[i + 2]
+//
+// turns into
+//
+// p = &a[i + 1];
+// if (b > 0)
+// c = *p;
+// q = &a[i + 2];
+// if (d > 0)
+// e = *q;
+//
+// which could later be optimized to
+//
+// r = &a[i];
+// if (b > 0)
+// c = r[1];
+// if (d > 0)
+// e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativelyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, while this pass is intended to do more
+// aggressive speculation while counting on later passes to either capitalize on
+// that or clean it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// The risk that speculation will not pay off increases with the
+// number of instructions speculated, so we put a limit on that.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+ "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where "
+ "the cost of the instructions to speculatively execute "
+ "exceeds this limit."));
+
+// Speculating just a few instructions from a larger block tends not
+// to be profitable and this limit prevents that. A reason for that is
+// that small basic blocks are more likely to be candidates for
+// further optimization.
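Taken together, the cost limit above and the not-hoisted limit declared just below form a simple profitability gate: the speculated instructions must be cheap, and only a few instructions may stay behind. A hedged sketch of that gate in plain C++ (the constants 7 and 5 are the defaults quoted in the flags; the helper name is invented for illustration):

#include <cassert>

// Hypothetical helper mirroring the two cl::opt limits.
bool worthHoisting(unsigned speculatedCost, unsigned notHoistedCount) {
  const unsigned MaxSpeculationCost = 7; // spec-exec-max-speculation-cost
  const unsigned MaxNotHoisted = 5;      // spec-exec-max-not-hoisted
  return speculatedCost > 0 && speculatedCost <= MaxSpeculationCost &&
         notHoistedCount <= MaxNotHoisted;
}

int main() {
  assert(worthHoisting(3, 1));  // cheap block, mostly hoistable
  assert(!worthHoisting(9, 0)); // too much to hoist
  assert(!worthHoisting(2, 6)); // too much left behind
  return 0;
}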
+static cl::opt<unsigned> SpecExecMaxNotHoisted( + "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden, + cl::desc("Speculative execution is not applied to basic blocks where the " + "number of instructions that would not be speculatively executed " + "exceeds this limit.")); + +namespace { +class SpeculativeExecution : public FunctionPass { + public: + static char ID; + SpeculativeExecution(): FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; + + private: + bool runOnBasicBlock(BasicBlock &B); + bool considerHoistingFromTo(BasicBlock &FromBlock, BasicBlock &ToBlock); + + const TargetTransformInfo *TTI = nullptr; +}; +} // namespace + +char SpeculativeExecution::ID = 0; +INITIALIZE_PASS_BEGIN(SpeculativeExecution, "speculative-execution", + "Speculatively execute instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(SpeculativeExecution, "speculative-execution", + "Speculatively execute instructions", false, false) + +void SpeculativeExecution::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfoWrapperPass>(); +} + +bool SpeculativeExecution::runOnFunction(Function &F) { + if (skipOptnoneFunction(F)) + return false; + + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool Changed = false; + for (auto& B : F) { + Changed |= runOnBasicBlock(B); + } + return Changed; +} + +bool SpeculativeExecution::runOnBasicBlock(BasicBlock &B) { + BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator()); + if (BI == nullptr) + return false; + + if (BI->getNumSuccessors() != 2) + return false; + BasicBlock &Succ0 = *BI->getSuccessor(0); + BasicBlock &Succ1 = *BI->getSuccessor(1); + + if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) { + return false; + } + + // Hoist from if-then (triangle). + if (Succ0.getSinglePredecessor() != nullptr && + Succ0.getSingleSuccessor() == &Succ1) { + return considerHoistingFromTo(Succ0, B); + } + + // Hoist from if-else (triangle). + if (Succ1.getSinglePredecessor() != nullptr && + Succ1.getSingleSuccessor() == &Succ0) { + return considerHoistingFromTo(Succ1, B); + } + + // Hoist from if-then-else (diamond), but only if it is equivalent to + // an if-else or if-then due to one of the branches doing nothing. + if (Succ0.getSinglePredecessor() != nullptr && + Succ1.getSinglePredecessor() != nullptr && + Succ1.getSingleSuccessor() != nullptr && + Succ1.getSingleSuccessor() != &B && + Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) { + // If a block has only one instruction, then that is a terminator + // instruction so that the block does nothing. This does happen. + if (Succ1.size() == 1) // equivalent to if-then + return considerHoistingFromTo(Succ0, B); + if (Succ0.size() == 1) // equivalent to if-else + return considerHoistingFromTo(Succ1, B); + } + + return false; +} + +static unsigned ComputeSpeculationCost(const Instruction *I, + const TargetTransformInfo &TTI) { + switch (Operator::getOpcode(I)) { + case Instruction::GetElementPtr: + case Instruction::Add: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Select: + case Instruction::Shl: + case Instruction::Sub: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Xor: + case Instruction::ZExt: + case Instruction::SExt: + return TTI.getUserCost(I); + + default: + return UINT_MAX; // Disallow anything not whitelisted. 
+ }
+}
+
+bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock,
+ BasicBlock &ToBlock) {
+ SmallSet<const Instruction *, 8> NotHoisted;
+ const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
+ for (Value* V : U->operand_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (NotHoisted.count(I) > 0)
+ return false;
+ }
+ }
+ return true;
+ };
+
+ unsigned TotalSpeculationCost = 0;
+ for (auto& I : FromBlock) {
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+ if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
+ AllPrecedingUsesFromBlockHoisted(&I)) {
+ TotalSpeculationCost += Cost;
+ if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
+ return false; // too much to hoist
+ } else {
+ NotHoisted.insert(&I);
+ if (NotHoisted.size() > SpecExecMaxNotHoisted)
+ return false; // too much left behind
+ }
+ }
+
+ if (TotalSpeculationCost == 0)
+ return false; // nothing to hoist
+
+ for (auto I = FromBlock.begin(); I != FromBlock.end();) {
+ // We have to increment I before moving Current as moving Current
+ // changes the list that I is iterating through.
+ auto Current = I;
+ ++I;
+ if (!NotHoisted.count(Current)) {
+ Current->moveBefore(ToBlock.getTerminator());
+ }
+ }
+ return true;
+}
+
+namespace llvm {
+
+FunctionPass *createSpeculativeExecutionPass() {
+ return new SpeculativeExecution();
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 0000000..453503a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,710 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S] => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2.
When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating-point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR consider ILP is
+// left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+public:
+ // SLSR candidate. Such a candidate must be in one of the forms described in
+ // the header comments.
+ struct Candidate : public ilist_node<Candidate> {
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Add, // B + i * S
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate()
+ : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
+ Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+ Basis(nullptr) {}
+ Kind CandidateKind;
+ const SCEV *Base;
+ // Note that Index and Stride of a GEP candidate do not necessarily have the
+ // same integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
+ ConstantInt *Index;
+ Value *Stride;
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can correspond to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins;
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis;
+ };
+
+ static char ID;
+
+ StraightLineStrengthReduce()
+ : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+ initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // We do not modify the shape of the CFG.
+ AU.setPreservesCFG();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+ // Returns whether the candidate can be folded into an addressing mode.
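The payoff of the basis rewrite described in the file header is easy to verify with plain integers; a minimal standalone sketch (arbitrary values, not the pass's code):

#include <cassert>

int main() {
  long B = 100, S = 7; // shared base and stride
  long i = 2, i2 = 5;  // the constant indices i and i'
  long X = B + i * S;  // S1, the basis
  long Y = B + i2 * S; // S2, the candidate
  // SLSR rewrites S2 as X + (i' - i) * S; when (i' - i) * S is shared
  // across unrolled iterations, each rewritten candidate costs one add.
  assert(Y == X + (i2 - i) * S);
  return 0;
}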
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+ // Returns true if C is already in a simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidatesAndFindBasis(Instruction *I);
+ // Allocate candidates and find bases for Add instructions.
+ void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+ // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+ // candidate.
+ void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidatesAndFindBasisForMul(Instruction *I);
+ // Splits LHS into Base + Index and, if that succeeds, calls
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
+ Instruction *I);
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+ // A helper function that factors ArrayIdx to a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+
+ const DataLayout *DL;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ ilist<Candidate> Candidates;
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will actually be removed
+ // after all rewriting finishes.
+ std::vector<Instruction *> UnlinkedInstructions; +}; +} // anonymous namespace + +char StraightLineStrengthReduce::ID = 0; +INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", + "Straight line strength reduction", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr", + "Straight line strength reduction", false, false) + +FunctionPass *llvm::createStraightLineStrengthReducePass() { + return new StraightLineStrengthReduce(); +} + +bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, + const Candidate &C) { + return (Basis.Ins != C.Ins && // skip the same instruction + // Basis must dominate C in order to rewrite C with respect to Basis. + DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) && + // They share the same base, stride, and candidate kind. + Basis.Base == C.Base && + Basis.Stride == C.Stride && + Basis.CandidateKind == C.CandidateKind); +} + +static bool isGEPFoldable(GetElementPtrInst *GEP, + const TargetTransformInfo *TTI, + const DataLayout *DL) { + GlobalVariable *BaseGV = nullptr; + int64_t BaseOffset = 0; + bool HasBaseReg = false; + int64_t Scale = 0; + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) + BaseGV = GV; + else + HasBaseReg = true; + + gep_type_iterator GTI = gep_type_begin(GEP); + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) { + BaseOffset += ConstIdx->getSExtValue() * ElementSize; + } else { + // Needs scale register. + if (Scale != 0) { + // No addressing mode takes two scale registers. + return false; + } + Scale = ElementSize; + } + } else { + StructType *STy = cast<StructType>(*GTI); + uint64_t Field = cast<ConstantInt>(*I)->getZExtValue(); + BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field); + } + } + return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV, + BaseOffset, HasBaseReg, Scale); +} + +// Returns whether (Base + Index * Stride) can be folded to an addressing mode. +static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride, + TargetTransformInfo *TTI) { + return TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true, + Index->getSExtValue()); +} + +bool StraightLineStrengthReduce::isFoldable(const Candidate &C, + TargetTransformInfo *TTI, + const DataLayout *DL) { + if (C.CandidateKind == Candidate::Add) + return isAddFoldable(C.Base, C.Index, C.Stride, TTI); + if (C.CandidateKind == Candidate::GEP) + return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI, DL); + return false; +} + +// Returns true if GEP has zero or one non-zero index. 
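isGEPFoldable above decomposes a GEP into the BaseGV/BaseOffset/Scale shape that isLegalAddressingMode checks: constant indices accumulate into BaseOffset, and at most one variable index supplies the Scale. The same decomposition in standalone C++ (the struct is invented for illustration; this is not the pass's code):

#include <cassert>
#include <cstddef>

struct Pair { int a; int b[8]; };

int main() {
  Pair p = {};
  long i = 3;
  // &p.b[i] decomposes into BaseReg(&p) + BaseOffset + i * Scale, the
  // shape a target inspects when deciding whether the address folds.
  size_t BaseOffset = offsetof(Pair, b); // from the constant field index
  size_t Scale = sizeof(int);            // from the single variable index
  assert((char *)&p.b[i] == (char *)&p + BaseOffset + i * Scale);
  return 0;
}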
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+ unsigned NumNonZeroIndices = 0;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+ ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+ if (ConstIdx == nullptr || !ConstIdx->isZero())
+ ++NumNonZeroIndices;
+ }
+ return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+ if (C.CandidateKind == Candidate::Add) {
+ // B + 1 * S or B + (-1) * S
+ return C.Index->isOne() || C.Index->isMinusOne();
+ }
+ if (C.CandidateKind == Candidate::Mul) {
+ // (B + 0) * S
+ return C.Index->isZero();
+ }
+ if (C.CandidateKind == Candidate::GEP) {
+ // (char*)B + S or (char*)B - S
+ return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+ hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+ }
+ return false;
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table lookup.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I) {
+ Candidate C(CT, B, Idx, S, I);
+ // SLSR can complicate an instruction in two cases:
+ //
+ // 1. If we can fold I into an addressing mode, computing I is likely free or
+ // takes only one instruction.
+ //
+ // 2. I is already in a simplest form. For example, when
+ // X = B + 8 * S
+ // Y = B + S,
+ // rewriting Y to X - 7 * S is probably a bad idea.
+ //
+ // In the above cases, we still add I to the candidate list so that I can be
+ // the basis of other candidates, but we leave I's basis blank so that I
+ // won't be rewritten.
+ if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+ // Limit the scan radius to avoid running in quadratic time.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
+ }
+ }
+ // Regardless of whether we find a basis for C, we need to push C to the
+ // candidate list so that it can be the basis of other candidates.
+ Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ allocateCandidatesAndFindBasisForAdd(I);
+ break;
+ case Instruction::Mul:
+ allocateCandidatesAndFindBasisForMul(I);
+ break;
+ case Instruction::GetElementPtr:
+ allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+ break;
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Instruction *I) {
+ // Try matching B + i * S.
+ if (!isa<IntegerType>(I->getType())) + return; + + assert(I->getNumOperands() == 2 && "isn't I an add?"); + Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); + allocateCandidatesAndFindBasisForAdd(LHS, RHS, I); + if (LHS != RHS) + allocateCandidatesAndFindBasisForAdd(RHS, LHS, I); +} + +void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd( + Value *LHS, Value *RHS, Instruction *I) { + Value *S = nullptr; + ConstantInt *Idx = nullptr; + if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) { + // I = LHS + RHS = LHS + Idx * S + allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I); + } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) { + // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx) + APInt One(Idx->getBitWidth(), 1); + Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue()); + allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I); + } else { + // At least, I = LHS + 1 * RHS + ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1); + allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS, + I); + } +} + +// Returns true if A matches B + C where C is constant. +static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) { + return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) || + match(A, m_Add(m_ConstantInt(C), m_Value(B)))); +} + +// Returns true if A matches B | C where C is constant. +static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) { + return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) || + match(A, m_Or(m_ConstantInt(C), m_Value(B)))); +} + +void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul( + Value *LHS, Value *RHS, Instruction *I) { + Value *B = nullptr; + ConstantInt *Idx = nullptr; + if (matchesAdd(LHS, B, Idx)) { + // If LHS is in the form of "Base + Index", then I is in the form of + // "(Base + Index) * RHS". + allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I); + } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) { + // If LHS is in the form of "Base | Index" and Base and Index have no common + // bits set, then + // Base | Index = Base + Index + // and I is thus in the form of "(Base + Index) * RHS". + allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I); + } else { + // Otherwise, at least try the form (LHS + 0) * RHS. + ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0); + allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS, + I); + } +} + +void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul( + Instruction *I) { + // Try matching (B + i) * S. + // TODO: we could extend SLSR to float and vector types. + if (!isa<IntegerType>(I->getType())) + return; + + assert(I->getNumOperands() == 2 && "isn't I a mul?"); + Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); + allocateCandidatesAndFindBasisForMul(LHS, RHS, I); + if (LHS != RHS) { + // Symmetrically, try to split RHS to Base + Index. + allocateCandidatesAndFindBasisForMul(RHS, LHS, I); + } +} + +void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( + const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize, + Instruction *I) { + // I = B + sext(Idx *nsw S) * ElementSize + // = B + (sext(Idx) * sext(S)) * ElementSize + // = B + (sext(Idx) * ElementSize) * sext(S) + // Casting to IntegerType is safe because we skipped vector GEPs. 
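Both matchers above canonicalize a shift by a constant into a multiplication, so that LHS + (S << k) still fits the B + i * S form with i = 1 << k. The identity as a standalone check (illustrative only):

#include <cassert>

int main() {
  long S = 13;
  unsigned k = 3;
  // A shifted stride matches the B + i * S pattern with a power-of-two
  // constant index, since S << k equals S * (1 << k).
  assert((S << k) == S * (1L << k));
  return 0;
}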
+ IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType())); + ConstantInt *ScaledIdx = ConstantInt::get( + IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true); + allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I); +} + +void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx, + const SCEV *Base, + uint64_t ElementSize, + GetElementPtrInst *GEP) { + // At least, ArrayIdx = ArrayIdx *nsw 1. + allocateCandidatesAndFindBasisForGEP( + Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1), + ArrayIdx, ElementSize, GEP); + Value *LHS = nullptr; + ConstantInt *RHS = nullptr; + // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx + // itself. This would allow us to handle the shl case for free. However, + // matching SCEVs has two issues: + // + // 1. this would complicate rewriting because the rewriting procedure + // would have to translate SCEVs back to IR instructions. This translation + // is difficult when LHS is further evaluated to a composite SCEV. + // + // 2. ScalarEvolution is designed to be control-flow oblivious. It tends + // to strip nsw/nuw flags which are critical for SLSR to trace into + // sext'ed multiplication. + if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) { + // SLSR is currently unsafe if i * S may overflow. + // GEP = Base + sext(LHS *nsw RHS) * ElementSize + allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP); + } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) { + // GEP = Base + sext(LHS <<nsw RHS) * ElementSize + // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize + APInt One(RHS->getBitWidth(), 1); + ConstantInt *PowerOf2 = + ConstantInt::get(RHS->getContext(), One << RHS->getValue()); + allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP); + } +} + +void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( + GetElementPtrInst *GEP) { + // TODO: handle vector GEPs + if (GEP->getType()->isVectorTy()) + return; + + SmallVector<const SCEV *, 4> IndexExprs; + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) + IndexExprs.push_back(SE->getSCEV(*I)); + + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) { + if (!isa<SequentialType>(*GTI++)) + continue; + + const SCEV *OrigIndexExpr = IndexExprs[I - 1]; + IndexExprs[I - 1] = SE->getConstant(OrigIndexExpr->getType(), 0); + + // The base of this candidate is GEP's base plus the offsets of all + // indices except this current one. + const SCEV *BaseExpr = SE->getGEPExpr(GEP->getSourceElementType(), + SE->getSCEV(GEP->getPointerOperand()), + IndexExprs, GEP->isInBounds()); + Value *ArrayIdx = GEP->getOperand(I); + uint64_t ElementSize = DL->getTypeAllocSize(*GTI); + factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP); + // When ArrayIdx is the sext of a value, we try to factor that value as + // well. Handling this case is important because array indices are + // typically sign-extended to the pointer size. + Value *TruncatedArrayIdx = nullptr; + if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx)))) + factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP); + + IndexExprs[I - 1] = OrigIndexExpr; + } +} + +// A helper function that unifies the bitwidth of A and B. 
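unifyBitWidth and emitBump below compute the bump (i' - i) * S, strength-reducing power-of-two deltas to shifts and treating the deltas 1 and -1 as special cases. The arithmetic, sketched with plain integers (not the pass's code):

#include <cassert>

int main() {
  long S = 9;     // the stride
  long delta = 4; // i' - i, a power of two with log2(4) == 2
  assert(delta * S == (S << 2));   // Bump = S << log2(i' - i)
  assert(-delta * S == -(S << 2)); // a negative delta negates the shift
  long one = 1, minusOne = -1;
  assert(one * S == S && minusOne * S == -S); // the two common cases
  return 0;
}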
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+ const Candidate &C,
+ IRBuilder<> &Builder,
+ const DataLayout *DL,
+ bool &BumpWithUglyGEP) {
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ BumpWithUglyGEP = false;
+ if (Basis.CandidateKind == Candidate::GEP) {
+ APInt ElementSize(
+ IndexOffset.getBitWidth(),
+ DL->getTypeAllocSize(
+ cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType()));
+ APInt Q, R;
+ APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+ if (R.getSExtValue() == 0)
+ IndexOffset = Q;
+ else
+ BumpWithUglyGEP = true;
+ }
+
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset.getSExtValue() == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.getSExtValue() == -1)
+ return Builder.CreateNeg(C.Stride);
+
+ // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+ // have different bit widths.
+ IntegerType *DeltaType =
+ IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+ if (IndexOffset.isPowerOf2()) {
+ // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+ return Builder.CreateShl(ExtendedStride, Exponent);
+ }
+ if ((-IndexOffset).isPowerOf2()) {
+ // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i - i').
+ ConstantInt *Exponent =
+ ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+ return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+ }
+ Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+ return Builder.CreateMul(ExtendedStride, Delta);
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+ // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+ // basis of a candidate cannot be unlinked before the candidate.
+ assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
+
+ IRBuilder<> Builder(C.Ins);
+ bool BumpWithUglyGEP;
+ Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+ Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+ switch (C.CandidateKind) {
+ case Candidate::Add:
+ case Candidate::Mul:
+ // C = Basis + Bump
+ if (BinaryOperator::isNeg(Bump)) {
+ // If Bump is a neg instruction, emit C = Basis - (-Bump).
+ Reduced =
+ Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+ // We only use the negative argument of Bump, and Bump itself may be
+ // trivially dead.
+ RecursivelyDeleteTriviallyDeadInstructions(Bump);
+ } else {
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+ }
+ break;
+ case Candidate::GEP:
+ {
+ Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+ bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+ if (BumpWithUglyGEP) {
+ // C = (char *)Basis + Bump
+ unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+ Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+ Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+ if (InBounds)
+ Reduced =
+ Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
+ else
+ Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
+ Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+ } else {
+ // C = gep Basis, Bump
+ // Canonicalize bump to pointer size.
+ Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+ if (InBounds)
+ Reduced = Builder.CreateInBoundsGEP(nullptr, Basis.Ins, Bump);
+ else
+ Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump);
+ }
+ }
+ break;
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ };
+ Reduced->takeName(C.Ins);
+ C.Ins->replaceAllUsesWith(Reduced);
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.push_back(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolution>();
+ // Traverse the dominator tree in depth-first order. This order makes sure
+ // all bases of a candidate are in Candidates when we process it.
+ for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+ node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+ for (auto &I : *node->getBlock())
+ allocateCandidatesAndFindBasis(&I);
+ }
+
+ // Rewrite candidates in reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+ // Delete all unlinked instructions.
+ for (auto *UnlinkedInst : UnlinkedInstructions) { + for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) { + Value *Op = UnlinkedInst->getOperand(I); + UnlinkedInst->setOperand(I, nullptr); + RecursivelyDeleteTriviallyDeadInstructions(Op); + } + delete UnlinkedInst; + } + bool Ret = !UnlinkedInstructions.empty(); + UnlinkedInstructions.clear(); + return Ret; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 7fe87f9..4f23e20 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -9,6 +9,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" @@ -16,6 +17,8 @@ #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -249,7 +252,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LowerSwitchID); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); RegionPass::getAnalysisUsage(AU); } @@ -281,11 +284,65 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { /// \brief Build up the general order of nodes void StructurizeCFG::orderNodes() { - scc_iterator<Region *> I = scc_begin(ParentRegion); - for (Order.clear(); !I.isAtEnd(); ++I) { - const std::vector<RegionNode *> &Nodes = *I; - Order.append(Nodes.begin(), Nodes.end()); + RNVector TempOrder; + ReversePostOrderTraversal<Region*> RPOT(ParentRegion); + TempOrder.append(RPOT.begin(), RPOT.end()); + + std::map<Loop*, unsigned> LoopBlocks; + + + // The reverse post-order traversal of the list gives us an ordering close + // to what we want. The only problem with it is that sometimes backedges + // for outer loops will be visited before backedges for inner loops. + for (RegionNode *RN : TempOrder) { + BasicBlock *BB = RN->getEntry(); + Loop *Loop = LI->getLoopFor(BB); + if (!LoopBlocks.count(Loop)) { + LoopBlocks[Loop] = 1; + continue; + } + LoopBlocks[Loop]++; + } + + unsigned CurrentLoopDepth = 0; + Loop *CurrentLoop = nullptr; + BBSet TempVisited; + for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E; ++I) { + BasicBlock *BB = (*I)->getEntry(); + unsigned LoopDepth = LI->getLoopDepth(BB); + + if (std::find(Order.begin(), Order.end(), *I) != Order.end()) + continue; + + if (LoopDepth < CurrentLoopDepth) { + // Make sure we have visited all blocks in this loop before moving back to + // the outer loop. + + RNVector::iterator LoopI = I; + while(LoopBlocks[CurrentLoop]) { + LoopI++; + BasicBlock *LoopBB = (*LoopI)->getEntry(); + if (LI->getLoopFor(LoopBB) == CurrentLoop) { + LoopBlocks[CurrentLoop]--; + Order.push_back(*LoopI); + } + } + } + + CurrentLoop = LI->getLoopFor(BB); + if (CurrentLoop) { + LoopBlocks[CurrentLoop]--; + } + + CurrentLoopDepth = LoopDepth; + Order.push_back(*I); } + + // This pass originally used a post-order traversal and then operated on + // the list in reverse. 
Now that we are using a reverse post-order traversal,
+ // rather than re-working the whole pass to operate on the list in order,
+ // we just reverse the list and continue to operate on it in reverse.
+ std::reverse(Order.begin(), Order.end());
}
/// \brief Determine the end of the loops
@@ -304,7 +361,7 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
BasicBlock *Succ = Term->getSuccessor(i);
- if (Visited.count(Succ) && LI->isLoopHeader(Succ) ) {
+ if (Visited.count(Succ)) {
Loops[Succ] = BB;
}
}
@@ -441,6 +498,10 @@ void StructurizeCFG::collectInfos() {
for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
OI != OE; ++OI) {
+ DEBUG(dbgs() << "Visiting: " <<
+ ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") <<
+ (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n");
+
// Analyze all the conditions leading to a node
gatherPredicates(*OI);
@@ -826,7 +887,7 @@ void StructurizeCFG::createFlow() {
/// no longer dominate all their uses. Not sure if this is really necessary
void StructurizeCFG::rebuildSSA() {
SSAUpdater Updater;
- for (const auto &BB : ParentRegion->blocks())
+ for (auto *BB : ParentRegion->blocks())
for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
@@ -866,7 +927,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
ParentRegion = R;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
orderNodes();
collectInfos();
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index f3c3e30..9eef132 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -54,8 +54,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
@@ -87,7 +87,6 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
namespace {
struct TailCallElim : public FunctionPass {
const TargetTransformInfo *TTI;
- const DataLayout *DL;
static char ID; // Pass identification, replacement for typeid
TailCallElim() : FunctionPass(ID) {
@@ -126,7 +125,7 @@ namespace {
char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
false, false)
@@ -136,7 +135,7 @@ FunctionPass *llvm::createTailCallEliminationPass() {
}
void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
/// \brief Scan the specified function for alloca instructions.
@@ -159,8 +158,6 @@ bool TailCallElim::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - DL = F.getParent()->getDataLayout(); - bool AllCallsAreTailCalls = false; bool Modified = markTails(F, AllCallsAreTailCalls); if (AllCallsAreTailCalls) @@ -386,16 +383,15 @@ bool TailCallElim::runTRE(Function &F) { // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return false; - TTI = &getAnalysis<TargetTransformInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); BasicBlock *OldEntry = nullptr; bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; - // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls - // marked with the 'tail' attribute, because doing so would cause the stack - // size to increase (real TRE would deallocate variable sized allocas, TRE - // doesn't). + // If false, we cannot perform TRE on tail calls marked with the 'tail' + // attribute, because doing so would cause the stack size to increase (real + // TRE would deallocate variable sized allocas, TRE doesn't). bool CanTRETailMarkedCall = CanTRE(F); // Change any tail recursive calls to loops. @@ -404,28 +400,19 @@ bool TailCallElim::runTRE(Function &F) { // alloca' is changed from being a static alloca to being a dynamic alloca. // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. - SmallVector<BasicBlock*, 8> BBToErase; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { + BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, !CanTRETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) { + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, !CanTRETailMarkedCall); - // FoldReturnAndProcessPred may have emptied some BB. Remember to - // erase them. - if (Change && BB->empty()) - BBToErase.push_back(BB); - - } MadeChange |= Change; } } - for (auto BB: BBToErase) - BB->eraseFromParent(); - // If we eliminated any tail recursions, it's possible that we inserted some // silly PHI nodes which just merge an initial value (the incoming operand) // with themselves. Check to see if we did and clean up our mess if so. This @@ -435,7 +422,7 @@ bool TailCallElim::runTRE(Function &F) { PHINode *PN = ArgumentPHIs[i]; // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = SimplifyInstruction(PN)) { + if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { PN->replaceAllUsesWith(PNV); PN->eraseFromParent(); } @@ -445,7 +432,7 @@ bool TailCallElim::runTRE(Function &F) { } -/// CanMoveAboveCall - Return true if it is safe to move the specified +/// Return true if it is safe to move the specified /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. /// @@ -464,7 +451,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // being loaded from. 
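The load case of CanMoveAboveCall is the subtle one: a load may be moved above the call only if the call cannot write memory, or the load is safe to execute unconditionally. A self-contained C++ sketch of why the write matters (names invented for illustration):

#include <cassert>

int g = 1;

int callee() {
  g = 2; // the call writes memory
  return 0;
}

int main() {
  int r = callee();
  int v = g; // load placed after the call: observes 2
  assert(v == 2);
  // Hoisting the load above callee() would have read 1 instead, which is
  // exactly the reordering this check must rule out.
  return r + v - 2;
}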
if (CI->mayWriteToMemory() || !isSafeToLoadUnconditionally(L->getPointerOperand(), L, - L->getAlignment(), DL)) + L->getAlignment())) return false; } } @@ -480,13 +467,11 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { return true; } -// isDynamicConstant - Return true if the specified value is the same when the -// return would exit as it was when the initial iteration of the recursive -// function was executed. -// -// We currently handle static constants and arguments that are not modified as -// part of the recursion. -// +/// Return true if the specified value is the same when the return would exit +/// as it was when the initial iteration of the recursive function was executed. +/// +/// We currently handle static constants and arguments that are not modified as +/// part of the recursion. static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { if (isa<Constant>(V)) return true; // Static constants are always dyn consts @@ -518,10 +503,9 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { return false; } -// getCommonReturnValue - Check to see if the function containing the specified -// tail call consistently returns the same runtime-constant value at all exit -// points except for IgnoreRI. If so, return the returned value. -// +/// Check to see if the function containing the specified tail call consistently +/// returns the same runtime-constant value at all exit points except for +/// IgnoreRI. If so, return the returned value. static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { Function *F = CI->getParent()->getParent(); Value *ReturnedValue = nullptr; @@ -545,10 +529,9 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { return ReturnedValue; } -/// CanTransformAccumulatorRecursion - If the specified instruction can be -/// transformed using accumulator recursion elimination, return the constant -/// which is the start of the accumulator value. Otherwise return null. -/// +/// If the specified instruction can be transformed using accumulator recursion +/// elimination, return the constant which is the start of the accumulator +/// value. Otherwise return null. Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { if (!I->isAssociative() || !I->isCommutative()) return nullptr; @@ -836,14 +819,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred); // Cleanup: if all predecessors of BB have been eliminated by - // FoldReturnIntoUncondBranch, we would like to delete it, but we - // can not just nuke it as it is being used as an iterator by our caller. - // Just empty it, and the caller will erase it when it is safe to do so. - // It is important to empty it, because the ret instruction in there is - // still using a value which EliminateRecursiveTailCall will attempt - // to remove. + // FoldReturnIntoUncondBranch, delete it. It is important to empty it, + // because the ret instruction in there is still using a value which + // EliminateRecursiveTailCall will attempt to remove. 
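CanTransformAccumulatorRecursion above is what lets TRE handle functions such as factorial, where the recursive call feeds an associative and commutative operation. In C terms the rewrite introduces an accumulator seeded with the operation's identity; a sketch of the effect (not the pass's actual output):

#include <cassert>

// Recursive form: return n ? n * fact(n - 1) : 1;  (not yet a tail call)
// After accumulator recursion elimination it becomes a loop:
long fact(long n) {
  long acc = 1;   // identity of the accumulated '*'
  for (; n > 1; --n)
    acc *= n;     // the multiply, now applied iteratively
  return acc;
}

int main() {
  assert(fact(5) == 120);
  return 0;
}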
if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) - BB->getInstList().clear(); + BB->eraseFromParent(); EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp index cce016a..03c3a80 100644 --- a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/MathExtras.h" #include <algorithm> namespace llvm { @@ -33,11 +34,6 @@ static inline bool CompareVars(const ASanStackVariableDescription &a, // with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars. static const size_t kMinAlignment = 16; -static size_t RoundUpTo(size_t X, size_t RoundTo) { - assert((RoundTo & (RoundTo - 1)) == 0); - return (X + RoundTo - 1) & ~(RoundTo - 1); -} - // The larger the variable Size the larger is the redzone. // The resulting frame size is a multiple of Alignment. static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) { @@ -48,7 +44,7 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) { else if (Size <= 512) Res = Size + 64; else if (Size <= 4096) Res = Size + 128; else Res = Size + 256; - return RoundUpTo(Res, Alignment); + return RoundUpToAlignment(Res, Alignment); } void diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index 820544b..e9f6239 100644 --- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -174,42 +174,51 @@ bool AddDiscriminators::runOnFunction(Function &F) { for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { BasicBlock *B = I; TerminatorInst *Last = B->getTerminator(); - DebugLoc LastLoc = Last->getDebugLoc(); - if (LastLoc.isUnknown()) continue; - DILocation LastDIL(LastLoc.getAsMDNode(Ctx)); + const DILocation *LastDIL = Last->getDebugLoc(); + if (!LastDIL) + continue; for (unsigned I = 0; I < Last->getNumSuccessors(); ++I) { BasicBlock *Succ = Last->getSuccessor(I); Instruction *First = Succ->getFirstNonPHIOrDbgOrLifetime(); - DebugLoc FirstLoc = First->getDebugLoc(); - if (FirstLoc.isUnknown()) continue; - DILocation FirstDIL(FirstLoc.getAsMDNode(Ctx)); + const DILocation *FirstDIL = First->getDebugLoc(); + if (!FirstDIL) + continue; // If the first instruction (First) of Succ is at the same file // location as B's last instruction (Last), add a new // discriminator for First's location and all the instructions // in Succ that share the same location with First. - if (FirstDIL.atSameLineAs(LastDIL)) { + if (!FirstDIL->canDiscriminate(*LastDIL)) { // Create a new lexical scope and compute a new discriminator // number for it. - StringRef Filename = FirstDIL.getFilename(); - DIScope Scope = FirstDIL.getScope(); - DIFile File = Builder.createFile(Filename, Scope.getDirectory()); - unsigned Discriminator = FirstDIL.computeNewDiscriminator(Ctx); - DILexicalBlockFile NewScope = + StringRef Filename = FirstDIL->getFilename(); + auto *Scope = FirstDIL->getScope(); + auto *File = Builder.createFile(Filename, Scope->getDirectory()); + + // FIXME: Calculate the discriminator here, based on local information, + // and delete DILocation::computeNewDiscriminator(). 
The current + // solution gives different results depending on other modules in the + // same context. All we really need is to discriminate between + // FirstDIL and LastDIL -- a local map would suffice. + unsigned Discriminator = FirstDIL->computeNewDiscriminator(); + auto *NewScope = Builder.createLexicalBlockFile(Scope, File, Discriminator); - DILocation NewDIL = FirstDIL.copyWithNewScope(Ctx, NewScope); - DebugLoc newDebugLoc = DebugLoc::getFromDILocation(NewDIL); + auto *NewDIL = + DILocation::get(Ctx, FirstDIL->getLine(), FirstDIL->getColumn(), + NewScope, FirstDIL->getInlinedAt()); + DebugLoc newDebugLoc = NewDIL; // Attach this new debug location to First and every // instruction following First that shares the same location. for (BasicBlock::iterator I1(*First), E1 = Succ->end(); I1 != E1; ++I1) { - if (I1->getDebugLoc() != FirstLoc) break; + if (I1->getDebugLoc().get() != FirstDIL) + break; I1->setDebugLoc(newDebugLoc); - DEBUG(dbgs() << NewDIL.getFilename() << ":" << NewDIL.getLineNumber() - << ":" << NewDIL.getColumnNumber() << ":" - << NewDIL.getDiscriminator() << *I1 << "\n"); + DEBUG(dbgs() << NewDIL->getFilename() << ":" << NewDIL->getLine() + << ":" << NewDIL->getColumn() << ":" + << NewDIL->getDiscriminator() << *I1 << "\n"); } DEBUG(dbgs() << "\n"); Changed = true; diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 983f025..f3c8013 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -65,16 +65,10 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// any single-entry PHI nodes in it, fold them away. This handles the case /// when all entries to the PHI nodes in a block are guaranteed equal, such as /// when the block has exactly one predecessor. -void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) { +void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, + MemoryDependenceAnalysis *MemDep) { if (!isa<PHINode>(BB->begin())) return; - AliasAnalysis *AA = nullptr; - MemoryDependenceAnalysis *MemDep = nullptr; - if (P) { - AA = P->getAnalysisIfAvailable<AliasAnalysis>(); - MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - } - while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { if (PN->getIncomingValue(0) != PN) PN->replaceAllUsesWith(PN->getIncomingValue(0)); @@ -113,7 +107,9 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. -bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { +bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, + LoopInfo *LI, AliasAnalysis *AA, + MemoryDependenceAnalysis *MemDep) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; @@ -140,8 +136,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Can't merge if there is PHI loop. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { if (PHINode *PN = dyn_cast<PHINode>(BI)) { - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (PN->getIncomingValue(i) == PN) + for (Value *IncValue : PN->incoming_values()) + if (IncValue == PN) return false; } else break; @@ -149,7 +145,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Begin by getting rid of unneeded PHIs. 
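The BasicBlockUtils entry points above now take their analyses as explicit arguments rather than digging them out of a Pass. A minimal caller-side sketch against the new signatures (hypothetical wrapper; every analysis pointer may be null):

    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/Analysis/MemoryDependenceAnalysis.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    // Merge BB into its single predecessor when legal, updating only the
    // analyses the caller actually maintains.
    static bool tryMergeExample(BasicBlock *BB, DominatorTree *DT, LoopInfo *LI,
                                AliasAnalysis *AA,
                                MemoryDependenceAnalysis *MemDep) {
      return MergeBlockIntoPredecessor(BB, DT, LI, AA, MemDep);
    }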
if (isa<PHINode>(BB->front())) - FoldSingleEntryPHINodes(BB, P); + FoldSingleEntryPHINodes(BB, AA, MemDep); // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); @@ -166,28 +162,23 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { PredBB->takeName(BB); // Finally, erase the old block and update dominator info. - if (P) { - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); - if (DomTreeNode *DTN = DT.getNode(BB)) { - DomTreeNode *PredDTN = DT.getNode(PredBB); - SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end()); - for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(), - DE = Children.end(); DI != DE; ++DI) - DT.changeImmediateDominator(*DI, PredDTN); - - DT.eraseNode(BB); - } + if (DT) + if (DomTreeNode *DTN = DT->getNode(BB)) { + DomTreeNode *PredDTN = DT->getNode(PredBB); + SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end()); + for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(), + DE = Children.end(); + DI != DE; ++DI) + DT->changeImmediateDominator(*DI, PredDTN); + + DT->eraseNode(BB); + } - if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) - LI->removeBlock(BB); + if (LI) + LI->removeBlock(BB); - if (MemoryDependenceAnalysis *MD = - P->getAnalysisIfAvailable<MemoryDependenceAnalysis>()) - MD->invalidateCachedPredecessors(); - } - } + if (MemDep) + MemDep->invalidateCachedPredecessors(); BB->eraseFromParent(); return true; @@ -240,12 +231,14 @@ void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) { /// SplitEdge - Split the edge connecting specified block. Pass P must /// not be NULL. -BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { +BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, + LoopInfo *LI) { unsigned SuccNum = GetSuccessorNumber(BB, Succ); // If this is a critical edge, let SplitCriticalEdge do it. TerminatorInst *LatchTerm = BB->getTerminator(); - if (SplitCriticalEdge(LatchTerm, SuccNum, P)) + if (SplitCriticalEdge(LatchTerm, SuccNum, CriticalEdgeSplittingOptions(DT, LI) + .setPreserveLCSSA())) return LatchTerm->getSuccessor(SuccNum); // If the edge isn't critical, then BB has a single successor or Succ has a @@ -255,23 +248,25 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { // block. assert(SP == BB && "CFG broken"); SP = nullptr; - return SplitBlock(Succ, Succ->begin(), P); + return SplitBlock(Succ, Succ->begin(), DT, LI); } // Otherwise, if BB has a single successor, split it at the bottom of the // block. assert(BB->getTerminator()->getNumSuccessors() == 1 && "Should have a single succ!"); - return SplitBlock(BB, BB->getTerminator(), P); + return SplitBlock(BB, BB->getTerminator(), DT, LI); } -unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) { +unsigned +llvm::SplitAllCriticalEdges(Function &F, + const CriticalEdgeSplittingOptions &Options) { unsigned NumBroken = 0; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { TerminatorInst *TI = I->getTerminator(); if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI)) for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (SplitCriticalEdge(TI, i, P)) + if (SplitCriticalEdge(TI, i, Options)) ++NumBroken; } return NumBroken; @@ -282,7 +277,8 @@ unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) { /// to a new block. 
The two blocks are joined by an unconditional branch and /// the loop info is updated. /// -BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { +BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, + DominatorTree *DT, LoopInfo *LI) { BasicBlock::iterator SplitIt = SplitPt; while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) ++SplitIt; @@ -290,26 +286,23 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { // The new block lives in whichever loop the old one did. This preserves // LCSSA as well, because we force the split point to be after any PHI nodes. - if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) + if (LI) if (Loop *L = LI->getLoopFor(Old)) - L->addBasicBlockToLoop(New, LI->getBase()); + L->addBasicBlockToLoop(New, *LI); - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); + if (DT) // Old dominates New. New node dominates all other nodes dominated by Old. - if (DomTreeNode *OldNode = DT.getNode(Old)) { + if (DomTreeNode *OldNode = DT->getNode(Old)) { std::vector<DomTreeNode *> Children; for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); I != E; ++I) Children.push_back(*I); - DomTreeNode *NewNode = DT.addNewBlock(New, Old); + DomTreeNode *NewNode = DT->addNewBlock(New, Old); for (std::vector<DomTreeNode *>::iterator I = Children.begin(), E = Children.end(); I != E; ++I) - DT.changeImmediateDominator(*I, NewNode); + DT->changeImmediateDominator(*I, NewNode); } - } return New; } @@ -318,45 +311,46 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { /// analysis information. static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, ArrayRef<BasicBlock *> Preds, - Pass *P, bool &HasLoopExit) { - if (!P) return; + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA, bool &HasLoopExit) { + // Update dominator tree if available. + if (DT) + DT->splitBlock(NewBB); + + // The rest of the logic is only relevant for updating the loop structures. + if (!LI) + return; - LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - Loop *L = LI ? LI->getLoopFor(OldBB) : nullptr; + Loop *L = LI->getLoopFor(OldBB); // If we need to preserve loop analyses, collect some information about how // this split will affect loops. bool IsLoopEntry = !!L; bool SplitMakesNewLoopHeader = false; - if (LI) { - bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID); - for (ArrayRef<BasicBlock*>::iterator - i = Preds.begin(), e = Preds.end(); i != e; ++i) { - BasicBlock *Pred = *i; - - // If we need to preserve LCSSA, determine if any of the preds is a loop - // exit. - if (PreserveLCSSA) - if (Loop *PL = LI->getLoopFor(Pred)) - if (!PL->contains(OldBB)) - HasLoopExit = true; - - // If we need to preserve LoopInfo, note whether any of the preds crosses - // an interesting loop boundary. - if (!L) continue; - if (L->contains(Pred)) - IsLoopEntry = false; - else - SplitMakesNewLoopHeader = true; - } + for (ArrayRef<BasicBlock *>::iterator i = Preds.begin(), e = Preds.end(); + i != e; ++i) { + BasicBlock *Pred = *i; + + // If we need to preserve LCSSA, determine if any of the preds is a loop + // exit. + if (PreserveLCSSA) + if (Loop *PL = LI->getLoopFor(Pred)) + if (!PL->contains(OldBB)) + HasLoopExit = true; + + // If we need to preserve LoopInfo, note whether any of the preds crosses + // an interesting loop boundary. 
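SplitEdge and SplitBlock undergo the same migration. A usage sketch of the new forms shown above (hypothetical helper; DT and LI are updated in place when non-null):

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    static BasicBlock *splitBeforeExample(BasicBlock *BB, Instruction *SplitPt,
                                          DominatorTree *DT, LoopInfo *LI) {
      // Everything from SplitPt onward (after any leading PHIs/landingpad)
      // moves to the returned block.
      return SplitBlock(BB, SplitPt, DT, LI);
    }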
+ if (!L)
+ continue;
+ if (L->contains(Pred))
+ IsLoopEntry = false;
+ else
+ SplitMakesNewLoopHeader = true;
 }
- // Update dominator tree if available.
- if (DominatorTreeWrapperPass *DTWP =
- P->getAnalysisIfAvailable<DominatorTreeWrapperPass>())
- DTWP->getDomTree().splitBlock(NewBB);
-
- if (!L) return;
+ // Unless we have a loop for OldBB, nothing else to do here.
+ if (!L)
+ return;
 if (IsLoopEntry) {
 // Add the new block to the nearest enclosing loop (and not an adjacent
@@ -382,9 +376,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
 }
 if (InnermostPredLoop)
- InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+ InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
 } else {
- L->addBasicBlockToLoop(NewBB, LI->getBase());
+ L->addBasicBlockToLoop(NewBB, *LI);
 if (SplitMakesNewLoopHeader)
 L->moveToHeader(NewBB);
 }
@@ -393,10 +387,9 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
 /// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming
 /// from NewBB. This also updates AliasAnalysis, if available.
 static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
- ArrayRef<BasicBlock*> Preds, BranchInst *BI,
- Pass *P, bool HasLoopExit) {
+ ArrayRef<BasicBlock *> Preds, BranchInst *BI,
+ AliasAnalysis *AA, bool HasLoopExit) {
 // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
- AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : nullptr;
 SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
 for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
 PHINode *PN = cast<PHINode>(I++);
@@ -461,11 +454,15 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
 }
 }
-/// SplitBlockPredecessors - This method transforms BB by introducing a new
-/// basic block into the function, and moving some of the predecessors of BB to
-/// be predecessors of the new block. The new predecessors are indicated by the
-/// Preds array, which has NumPreds elements in it. The new block is given a
-/// suffix of 'Suffix'.
+/// SplitBlockPredecessors - This method introduces at least one new basic block
+/// into the function and moves some of the predecessors of BB to be
+/// predecessors of the new block. The new predecessors are indicated by the
+/// Preds array. The new block is given a suffix of 'Suffix'. Returns the new
+/// basic block to which the predecessors from Preds now point.
+///
+/// If BB is a landingpad block then an additional basic block might be
+/// introduced. It will have the suffix 'Suffix'+".split-lp".
+/// See SplitLandingPadPredecessors for more details on this case.
 ///
 /// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
 /// LoopInfo, and LCSSA but no other analyses. In particular, it does not
@@ -473,8 +470,21 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
 /// of the edges being split is an exit of a loop with other exits).
 ///
 BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
- ArrayRef<BasicBlock*> Preds,
- const char *Suffix, Pass *P) {
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix, AliasAnalysis *AA,
+ DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA) {
+ // For the landingpads we need to act a bit differently.
+ // Delegate this work to the SplitLandingPadPredecessors.
+ if (BB->isLandingPad()) { + SmallVector<BasicBlock*, 2> NewBBs; + std::string NewName = std::string(Suffix) + ".split-lp"; + + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), + NewBBs, AA, DT, LI, PreserveLCSSA); + return NewBBs[0]; + } + // Create new basic block, insert right before the original block. BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, BB->getParent(), BB); @@ -505,10 +515,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, // Update DominatorTree, LoopInfo, and LCCSA analysis information. bool HasLoopExit = false; - UpdateAnalysisInformation(BB, NewBB, Preds, P, HasLoopExit); + UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA, + HasLoopExit); // Update the PHI nodes in BB with the values coming from NewBB. - UpdatePHINodes(BB, NewBB, Preds, BI, P, HasLoopExit); + UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit); return NewBB; } @@ -526,10 +537,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, /// exits). /// void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, - ArrayRef<BasicBlock*> Preds, + ArrayRef<BasicBlock *> Preds, const char *Suffix1, const char *Suffix2, - Pass *P, - SmallVectorImpl<BasicBlock*> &NewBBs) { + SmallVectorImpl<BasicBlock *> &NewBBs, + AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!"); // Create a new basic block for OrigBB's predecessors listed in Preds. Insert @@ -552,12 +564,12 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1); } - // Update DominatorTree, LoopInfo, and LCCSA analysis information. bool HasLoopExit = false; - UpdateAnalysisInformation(OrigBB, NewBB1, Preds, P, HasLoopExit); + UpdateAnalysisInformation(OrigBB, NewBB1, Preds, DT, LI, PreserveLCSSA, + HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB1. - UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, P, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit); // Move the remaining edges from OrigBB to point to NewBB2. SmallVector<BasicBlock*, 8> NewBB2Preds; @@ -589,10 +601,11 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, // Update DominatorTree, LoopInfo, and LCCSA analysis information. HasLoopExit = false; - UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, P, HasLoopExit); + UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, DT, LI, + PreserveLCSSA, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB2. 
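A caller-side sketch of the new SplitBlockPredecessors shown above (hypothetical wrapper). In the landing-pad case it now transparently delegates to SplitLandingPadPredecessors and returns the first block that creates:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    static BasicBlock *splitPredsExample(BasicBlock *BB,
                                         ArrayRef<BasicBlock *> Preds,
                                         AliasAnalysis *AA, DominatorTree *DT,
                                         LoopInfo *LI, bool PreserveLCSSA) {
      // Route the listed predecessors through a fresh block in front of BB.
      return SplitBlockPredecessors(BB, Preds, ".split", AA, DT, LI,
                                    PreserveLCSSA);
    }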
- UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, P, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit); } LandingPadInst *LPad = OrigBB->getLandingPadInst(); diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index eda22cf..7e83c9e 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -18,6 +18,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/CFG.h" @@ -41,14 +42,19 @@ namespace { } bool runOnFunction(Function &F) override { - unsigned N = SplitAllCriticalEdges(F, this); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + unsigned N = + SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI)); NumBroken += N; return N > 0; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); // No loop canonicalization guarantees are broken by this pass. AU.addPreservedID(LoopSimplifyID); @@ -125,10 +131,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, /// to. /// BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, - Pass *P, bool MergeIdenticalEdges, - bool DontDeleteUselessPhis, - bool SplitLandingPads) { - if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return nullptr; + const CriticalEdgeSplittingOptions &Options) { + if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges)) + return nullptr; assert(!isa<IndirectBrInst>(TI) && "Cannot split critical edge from IndirectBrInst"); @@ -179,29 +184,22 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // If there are any other edges from TIBB to DestBB, update those to go // through the split block, making those edges non-critical as well (and // reducing the number of phi entries in the DestBB if relevant). - if (MergeIdenticalEdges) { + if (Options.MergeIdenticalEdges) { for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) { if (TI->getSuccessor(i) != DestBB) continue; // Remove an entry for TIBB from DestBB phi nodes. - DestBB->removePredecessor(TIBB, DontDeleteUselessPhis); + DestBB->removePredecessor(TIBB, Options.DontDeleteUselessPHIs); // We found another edge to DestBB, go to NewBB instead. TI->setSuccessor(i, NewBB); } } - - - // If we don't have a pass object, we can't update anything... - if (!P) return NewBB; - - DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); - // If we have nothing to update, just return. + auto *AA = Options.AA; + auto *DT = Options.DT; + auto *LI = Options.LI; if (!DT && !LI) return NewBB; @@ -268,13 +266,13 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, if (Loop *DestLoop = LI->getLoopFor(DestBB)) { if (TIL == DestLoop) { // Both in the same loop, the NewBB joins loop. 
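The flag parameters of SplitCriticalEdge are likewise folded into the CriticalEdgeSplittingOptions struct used above. A minimal sketch of the calling convention (hypothetical helper; only the constructor and setter that actually appear in these hunks are assumed):

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    static BasicBlock *splitCritEdgeExample(TerminatorInst *TI,
                                            unsigned SuccNum,
                                            DominatorTree *DT, LoopInfo *LI) {
      // Returns null if the edge was not critical to begin with.
      return SplitCriticalEdge(
          TI, SuccNum, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
    }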
- DestLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + DestLoop->addBasicBlockToLoop(NewBB, *LI); } else if (TIL->contains(DestLoop)) { // Edge from an outer loop to an inner loop. Add to the outer loop. - TIL->addBasicBlockToLoop(NewBB, LI->getBase()); + TIL->addBasicBlockToLoop(NewBB, *LI); } else if (DestLoop->contains(TIL)) { // Edge from an inner loop to an outer loop. Add to the outer loop. - DestLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + DestLoop->addBasicBlockToLoop(NewBB, *LI); } else { // Edge from two loops with no containment relation. Because these // are natural loops, we know that the destination block must be the @@ -283,19 +281,20 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, assert(DestLoop->getHeader() == DestBB && "Should not create irreducible loops!"); if (Loop *P = DestLoop->getParentLoop()) - P->addBasicBlockToLoop(NewBB, LI->getBase()); + P->addBasicBlockToLoop(NewBB, *LI); } } + // If TIBB is in a loop and DestBB is outside of that loop, we may need // to update LoopSimplify form and LCSSA form. - if (!TIL->contains(DestBB) && - P->mustPreserveAnalysisID(LoopSimplifyID)) { + if (!TIL->contains(DestBB)) { assert(!TIL->contains(NewBB) && "Split point for loop exit is contained in loop!"); // Update LCSSA form in the newly created exit block. - if (P->mustPreserveAnalysisID(LCSSAID)) + if (Options.PreserveLCSSA) { createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); + } // The only that we can break LoopSimplify form by splitting a critical // edge is if after the split there exists some edge from TIL to DestBB @@ -322,20 +321,12 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, if (!LoopPreds.empty()) { assert(!DestBB->isLandingPad() && "We don't split edges to landing pads!"); - BasicBlock *NewExitBB = - SplitBlockPredecessors(DestBB, LoopPreds, "split", P); - if (P->mustPreserveAnalysisID(LCSSAID)) + BasicBlock *NewExitBB = SplitBlockPredecessors( + DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA); + if (Options.PreserveLCSSA) createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB); } } - // LCSSA form was updated above for the case where LoopSimplify is - // available, which means that all predecessors of loop exit blocks - // are within the loop. Without LoopSimplify form, it would be - // necessary to insert a new phi. - assert((!P->mustPreserveAnalysisID(LCSSAID) || - P->mustPreserveAnalysisID(LoopSimplifyID)) && - "SplitCriticalEdge doesn't know how to update LCCSA form " - "without LoopSimplify!"); } } diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 322485d..8aa7b2a 100644 --- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -21,7 +21,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -33,7 +33,7 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) { /// EmitStrLen - Emit a call to the strlen function to the builder, for the /// specified pointer. This always returns an integer value of size intptr_t. 
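On the BuildLibCalls side, the DataLayout now arrives by reference instead of as a possibly-null pointer. A usage sketch for the EmitStrLen signature above (hypothetical wrapper; note that Module::getDataLayout() returns a reference in this revision, so the "no DataLayout available" case no longer exists for callers):

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"

    using namespace llvm;

    static Value *emitStrlenExample(Value *Ptr, IRBuilder<> &B, Module &M,
                                    const TargetLibraryInfo *TLI) {
      // Returns null if the target has no usable strlen.
      return EmitStrLen(Ptr, B, M.getDataLayout(), TLI);
    }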
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, +Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strlen)) return nullptr; @@ -45,12 +45,9 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrLen = M->getOrInsertFunction("strlen", - AttributeSet::get(M->getContext(), - AS), - TD->getIntPtrType(Context), - B.getInt8PtrTy(), - nullptr); + Constant *StrLen = M->getOrInsertFunction( + "strlen", AttributeSet::get(M->getContext(), AS), + DL.getIntPtrType(Context), B.getInt8PtrTy(), nullptr); CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen"); if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -62,7 +59,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, /// specified pointer. Ptr is required to be some pointer type, MaxLen must /// be of size_t type, and the return value has 'intptr_t' type. Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, - const DataLayout *TD, const TargetLibraryInfo *TLI) { + const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strnlen)) return nullptr; @@ -73,14 +70,11 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrNLen = M->getOrInsertFunction("strnlen", - AttributeSet::get(M->getContext(), - AS), - TD->getIntPtrType(Context), - B.getInt8PtrTy(), - TD->getIntPtrType(Context), - nullptr); - CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen"); + Constant *StrNLen = + M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), AS), + DL.getIntPtrType(Context), B.getInt8PtrTy(), + DL.getIntPtrType(Context), nullptr); + CallInst *CI = B.CreateCall(StrNLen, {CastToCStr(Ptr, B), MaxLen}, "strnlen"); if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -91,7 +85,7 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, /// specified pointer and character. Ptr is required to be some pointer type, /// and the return value has 'i8*' type. Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, - const DataLayout *TD, const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strchr)) return nullptr; @@ -106,17 +100,16 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, AttributeSet::get(M->getContext(), AS), I8Ptr, I8Ptr, I32Ty, nullptr); - CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), - ConstantInt::get(I32Ty, C), "strchr"); + CallInst *CI = B.CreateCall( + StrChr, {CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr"); if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; } /// EmitStrNCmp - Emit a call to the strncmp function to the builder. 
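A pattern repeated throughout these hunks: the fixed-arity IRBuilder helpers (CreateCall2, CreateCall3, CreateCall4) give way to a single CreateCall taking an argument list. A minimal before/after sketch with a hypothetical callee:

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    static CallInst *callPairExample(IRBuilder<> &B, Value *Callee, Value *A0,
                                     Value *A1) {
      // Was: B.CreateCall2(Callee, A0, A1, "tmp");
      // Now the argument count is no longer baked into the method name.
      return B.CreateCall(Callee, {A0, A1}, "tmp");
    }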
-Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, - IRBuilder<> &B, const DataLayout *TD, - const TargetLibraryInfo *TLI) { +Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, + const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::strncmp)) return nullptr; @@ -128,15 +121,11 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *StrNCmp = M->getOrInsertFunction("strncmp", - AttributeSet::get(M->getContext(), - AS), - B.getInt32Ty(), - B.getInt8PtrTy(), - B.getInt8PtrTy(), - TD->getIntPtrType(Context), nullptr); - CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B), - CastToCStr(Ptr2, B), Len, "strncmp"); + Value *StrNCmp = M->getOrInsertFunction( + "strncmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), + B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr); + CallInst *CI = B.CreateCall( + StrNCmp, {CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len}, "strncmp"); if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -147,8 +136,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, /// EmitStrCpy - Emit a call to the strcpy function to the builder, for the /// specified pointer arguments. Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, - const DataLayout *TD, const TargetLibraryInfo *TLI, - StringRef Name) { + const TargetLibraryInfo *TLI, StringRef Name) { if (!TLI->has(LibFunc::strcpy)) return nullptr; @@ -161,8 +149,8 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, Value *StrCpy = M->getOrInsertFunction(Name, AttributeSet::get(M->getContext(), AS), I8Ptr, I8Ptr, I8Ptr, nullptr); - CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), - Name); + CallInst *CI = + B.CreateCall(StrCpy, {CastToCStr(Dst, B), CastToCStr(Src, B)}, Name); if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; @@ -170,8 +158,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, /// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the /// specified pointer arguments. -Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, - IRBuilder<> &B, const DataLayout *TD, +Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B, const TargetLibraryInfo *TLI, StringRef Name) { if (!TLI->has(LibFunc::strncpy)) return nullptr; @@ -187,8 +174,8 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, AS), I8Ptr, I8Ptr, I8Ptr, Len->getType(), nullptr); - CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), - Len, "strncpy"); + CallInst *CI = B.CreateCall( + StrNCpy, {CastToCStr(Dst, B), CastToCStr(Src, B), Len}, "strncpy"); if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; @@ -198,7 +185,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, /// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src /// are pointers. 
Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, - IRBuilder<> &B, const DataLayout *TD, + IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memcpy_chk)) return nullptr; @@ -208,16 +195,13 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttributeSet::get(M->getContext(), AS), - B.getInt8PtrTy(), - B.getInt8PtrTy(), - B.getInt8PtrTy(), - TD->getIntPtrType(Context), - TD->getIntPtrType(Context), nullptr); + Value *MemCpy = M->getOrInsertFunction( + "__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(), + B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), + DL.getIntPtrType(Context), nullptr); Dst = CastToCStr(Dst, B); Src = CastToCStr(Src, B); - CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize); + CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize}); if (const Function *F = dyn_cast<Function>(MemCpy->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; @@ -225,9 +209,8 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, /// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value. -Value *llvm::EmitMemChr(Value *Ptr, Value *Val, - Value *Len, IRBuilder<> &B, const DataLayout *TD, - const TargetLibraryInfo *TLI) { +Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B, + const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::memchr)) return nullptr; @@ -236,14 +219,10 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemChr = M->getOrInsertFunction("memchr", - AttributeSet::get(M->getContext(), AS), - B.getInt8PtrTy(), - B.getInt8PtrTy(), - B.getInt32Ty(), - TD->getIntPtrType(Context), - nullptr); - CallInst *CI = B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr"); + Value *MemChr = M->getOrInsertFunction( + "memchr", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(), + B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context), nullptr); + CallInst *CI = B.CreateCall(MemChr, {CastToCStr(Ptr, B), Val, Len}, "memchr"); if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -252,9 +231,8 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, } /// EmitMemCmp - Emit a call to the memcmp function. 
-Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
- Value *Len, IRBuilder<> &B, const DataLayout *TD,
- const TargetLibraryInfo *TLI) {
+Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
 if (!TLI->has(LibFunc::memcmp))
 return nullptr;
@@ -266,14 +244,11 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
 AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
 LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemCmp = M->getOrInsertFunction("memcmp",
- AttributeSet::get(M->getContext(), AS),
- B.getInt32Ty(),
- B.getInt8PtrTy(),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context), nullptr);
- CallInst *CI = B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B),
- Len, "memcmp");
+ Value *MemCmp = M->getOrInsertFunction(
+ "memcmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
+ CallInst *CI = B.CreateCall(
+ MemCmp, {CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len}, "memcmp");
 if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
 CI->setCallingConv(F->getCallingConv());
@@ -329,7 +304,7 @@ Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
 Module *M = B.GetInsertBlock()->getParent()->getParent();
 Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(),
 Op2->getType(), nullptr);
- CallInst *CI = B.CreateCall2(Callee, Op1, Op2, Name);
+ CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name);
 CI->setAttributes(Attrs);
 if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
 CI->setCallingConv(F->getCallingConv());
@@ -339,7 +314,7 @@ Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
 /// EmitPutChar - Emit a call to the putchar function. This assumes that Char
 /// is an integer.
-Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
+Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B,
 const TargetLibraryInfo *TLI) {
 if (!TLI->has(LibFunc::putchar))
 return nullptr;
@@ -361,7 +336,7 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
 /// EmitPutS - Emit a call to the puts function. This assumes that Str is
 /// some pointer.
-Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
+Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B,
 const TargetLibraryInfo *TLI) {
 if (!TLI->has(LibFunc::puts))
 return nullptr;
@@ -386,7 +361,7 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
 /// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
 /// an integer and File is a pointer to FILE.
 Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
- const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI) {
 if (!TLI->has(LibFunc::fputc))
 return nullptr;
@@ -409,7 +384,7 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
 File->getType(), nullptr);
 Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, "chari");
- CallInst *CI = B.CreateCall2(F, Char, File, "fputc");
+ CallInst *CI = B.CreateCall(F, {Char, File}, "fputc");
 if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
 CI->setCallingConv(Fn->getCallingConv());
@@ -419,7 +394,7 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
 /// EmitFPutS - Emit a call to the fputs function. Str is required to be a
 /// pointer and File is a pointer to FILE.
Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, - const DataLayout *TD, const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputs)) return nullptr; @@ -441,7 +416,7 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), nullptr); - CallInst *CI = B.CreateCall2(F, CastToCStr(Str, B), File, "fputs"); + CallInst *CI = B.CreateCall(F, {CastToCStr(Str, B), File}, "fputs"); if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); @@ -450,9 +425,8 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. -Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, - IRBuilder<> &B, const DataLayout *TD, - const TargetLibraryInfo *TLI) { +Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, + const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fwrite)) return nullptr; @@ -466,21 +440,18 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FWriteName, - AttributeSet::get(M->getContext(), AS), - TD->getIntPtrType(Context), - B.getInt8PtrTy(), - TD->getIntPtrType(Context), - TD->getIntPtrType(Context), - File->getType(), nullptr); + F = M->getOrInsertFunction( + FWriteName, AttributeSet::get(M->getContext(), AS), + DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context), + DL.getIntPtrType(Context), File->getType(), nullptr); else - F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context), - B.getInt8PtrTy(), - TD->getIntPtrType(Context), - TD->getIntPtrType(Context), - File->getType(), nullptr); - CallInst *CI = B.CreateCall4(F, CastToCStr(Ptr, B), Size, - ConstantInt::get(TD->getIntPtrType(Context), 1), File); + F = M->getOrInsertFunction(FWriteName, DL.getIntPtrType(Context), + B.getInt8PtrTy(), DL.getIntPtrType(Context), + DL.getIntPtrType(Context), File->getType(), + nullptr); + CallInst *CI = + B.CreateCall(F, {CastToCStr(Ptr, B), Size, + ConstantInt::get(DL.getIntPtrType(Context), 1), File}); if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index 96a763f..4f8d1df 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -34,7 +34,7 @@ #include <map> using namespace llvm; -// CloneBasicBlock - See comments in Cloning.h +/// See comments in Cloning.h. BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, @@ -154,23 +154,26 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, TypeMapper, Materializer); } -// Find the MDNode which corresponds to the DISubprogram data that described F. -static MDNode* FindSubprogram(const Function *F, DebugInfoFinder &Finder) { - for (DISubprogram Subprogram : Finder.subprograms()) { - if (Subprogram.describes(F)) return Subprogram; +// Find the MDNode which corresponds to the subprogram data that described F. 
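The debug-info wrappers become real pointer types in this merge (DISubprogram * instead of a by-value DISubprogram). The helper defined just below shows the new idiom; as a self-contained sketch of the same lookup, using only the APIs visible in these hunks:

    #include "llvm/IR/DebugInfo.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static DISubprogram *lookupSubprogram(const Function *F, Module &M) {
      DebugInfoFinder Finder;
      Finder.processModule(M);
      for (DISubprogram *SP : Finder.subprograms())
        if (SP->describes(F))
          return SP; // the subprogram whose metadata describes F
      return nullptr;
    }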
+static DISubprogram *FindSubprogram(const Function *F, + DebugInfoFinder &Finder) { + for (DISubprogram *Subprogram : Finder.subprograms()) { + if (Subprogram->describes(F)) + return Subprogram; } return nullptr; } // Add an operand to an existing MDNode. The new operand will be added at the // back of the operand list. -static void AddOperand(DICompileUnit CU, DIArray SPs, Metadata *NewSP) { +static void AddOperand(DICompileUnit *CU, DISubprogramArray SPs, + Metadata *NewSP) { SmallVector<Metadata *, 16> NewSPs; - NewSPs.reserve(SPs->getNumOperands() + 1); - for (unsigned I = 0, E = SPs->getNumOperands(); I != E; ++I) - NewSPs.push_back(SPs->getOperand(I)); + NewSPs.reserve(SPs.size() + 1); + for (auto *SP : SPs) + NewSPs.push_back(SP); NewSPs.push_back(NewSP); - CU.replaceSubprograms(DIArray(MDNode::get(CU->getContext(), NewSPs))); + CU->replaceSubprograms(MDTuple::get(CU->getContext(), NewSPs)); } // Clone the module-level debug info associated with OldFunc. The cloned data @@ -180,21 +183,21 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, DebugInfoFinder Finder; Finder.processModule(*OldFunc->getParent()); - const MDNode *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); + const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); if (!OldSubprogramMDNode) return; // Ensure that OldFunc appears in the map. // (if it's already there it must point to NewFunc anyway) VMap[OldFunc] = NewFunc; - DISubprogram NewSubprogram(MapMetadata(OldSubprogramMDNode, VMap)); - - for (DICompileUnit CU : Finder.compile_units()) { - DIArray Subprograms(CU.getSubprograms()); + auto *NewSubprogram = + cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap)); + for (auto *CU : Finder.compile_units()) { + auto Subprograms = CU->getSubprograms(); // If the compile unit's function list contains the old function, it should // also contain the new one. - for (unsigned i = 0; i < Subprograms.getNumElements(); i++) { - if ((MDNode*)Subprograms.getElement(i) == OldSubprogramMDNode) { + for (auto *SP : Subprograms) { + if (SP == OldSubprogramMDNode) { AddOperand(CU, Subprograms, NewSubprogram); break; } @@ -202,7 +205,7 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, } } -/// CloneFunction - Return a copy of the specified function, but without +/// Return a copy of the specified function, but without /// embedding the function into another module. Also, any references specified /// in the VMap are changed to refer to their mapped value instead of the /// original one. If any of the arguments to the function are in the VMap, @@ -250,8 +253,7 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, namespace { - /// PruningFunctionCloner - This class is a private class used to implement - /// the CloneAndPruneFunctionInto method. + /// This is a private class used to implement CloneAndPruneFunctionInto. 
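PruningFunctionCloner now consults an optional CloningDirector. A hedged sketch of a director that skips llvm.donothing calls; the base class's "keep going" action is assumed here to be named CloneInstruction, and the virtual's signature is assumed to match the call sites shown above, since neither appears in these hunks:

    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/Transforms/Utils/Cloning.h"

    using namespace llvm;

    class SkipDoNothingDirector : public CloningDirector {
    public:
      CloningAction handleInstruction(ValueToValueMapTy &VMap,
                                      const Instruction *Inst,
                                      BasicBlock *NewBB) override {
        // Drop llvm.donothing; it has no uses, so no remapping is required
        // for the skipped instruction.
        if (auto *II = dyn_cast<IntrinsicInst>(Inst))
          if (II->getIntrinsicID() == Intrinsic::donothing)
            return SkipInstruction;
        return CloneInstruction; // assumed default action
      }
    };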
struct PruningFunctionCloner { Function *NewFunc; const Function *OldFunc; @@ -259,29 +261,40 @@ namespace { bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; - const DataLayout *DL; + CloningDirector *Director; + ValueMapTypeRemapper *TypeMapper; + ValueMaterializer *Materializer; + public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, - ValueToValueMapTy &valueMap, - bool moduleLevelChanges, - const char *nameSuffix, - ClonedCodeInfo *codeInfo, - const DataLayout *DL) - : NewFunc(newFunc), OldFunc(oldFunc), - VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), - NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL) { + ValueToValueMapTy &valueMap, bool moduleLevelChanges, + const char *nameSuffix, ClonedCodeInfo *codeInfo, + CloningDirector *Director) + : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), + ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix), + CodeInfo(codeInfo), Director(Director) { + // These are optional components. The Director may return null. + if (Director) { + TypeMapper = Director->getTypeRemapper(); + Materializer = Director->getValueMaterializer(); + } else { + TypeMapper = nullptr; + Materializer = nullptr; + } } - /// CloneBlock - The specified block is found to be reachable, clone it and + /// The specified block is found to be reachable, clone it and /// anything that it can reach. - void CloneBlock(const BasicBlock *BB, + void CloneBlock(const BasicBlock *BB, + BasicBlock::const_iterator StartingInst, std::vector<const BasicBlock*> &ToClone); }; } -/// CloneBlock - The specified block is found to be reachable, clone it and +/// The specified block is found to be reachable, clone it and /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, + BasicBlock::const_iterator StartingInst, std::vector<const BasicBlock*> &ToClone){ WeakVH &BBEntry = VMap[BB]; @@ -307,26 +320,45 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, const_cast<BasicBlock*>(BB)); VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB); } - bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; - + // Loop over all instructions, and copy them over, DCE'ing as we go. This // loop doesn't include the terminator. - for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end(); + for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { + // If the "Director" remaps the instruction, don't clone it. + if (Director) { + CloningDirector::CloningAction Action + = Director->handleInstruction(VMap, II, NewBB); + // If the cloning director says stop, we want to stop everything, not + // just break out of the loop (which would cause the terminator to be + // cloned). The cloning director is responsible for inserting a proper + // terminator into the new basic block in this case. + if (Action == CloningDirector::StopCloningBB) + return; + // If the cloning director says skip, continue to the next instruction. + // In this case, the cloning director is responsible for mapping the + // skipped instruction to some value that is defined in the new + // basic block. + if (Action == CloningDirector::SkipInstruction) + continue; + } + Instruction *NewInst = II->clone(); // Eagerly remap operands to the newly cloned instruction, except for PHI // nodes for which we defer processing until we update the CFG. if (!isa<PHINode>(NewInst)) { RemapInstruction(NewInst, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); // If we can simplify this instruction to some other value, simply add // a mapping to that value rather than inserting a new instruction into // the basic block. - if (Value *V = SimplifyInstruction(NewInst, DL)) { + if (Value *V = + SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) { // On the off-chance that this simplifies to an instruction in the old // function, map it back into the new function. if (Value *MappedV = VMap.lookup(V)) @@ -354,6 +386,26 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // Finally, clone over the terminator. const TerminatorInst *OldTI = BB->getTerminator(); bool TerminatorDone = false; + if (Director) { + CloningDirector::CloningAction Action + = Director->handleInstruction(VMap, OldTI, NewBB); + // If the cloning director says stop, we want to stop everything, not + // just break out of the loop (which would cause the terminator to be + // cloned). The cloning director is responsible for inserting a proper + // terminator into the new basic block in this case. + if (Action == CloningDirector::StopCloningBB) + return; + if (Action == CloningDirector::CloneSuccessors) { + // If the director says to skip with a terminate instruction, we still + // need to clone this block's successors. + const TerminatorInst *TI = NewBB->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + ToClone.push_back(TI->getSuccessor(i)); + return; + } + assert(Action != CloningDirector::SkipInstruction && + "SkipInstruction is not valid for terminators."); + } if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) { if (BI->isConditional()) { // If the condition was a known constant in the callee... @@ -409,39 +461,53 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, } } -/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, -/// except that it does some simple constant prop and DCE on the fly. The -/// effect of this is to copy significantly less code in cases where (for -/// example) a function call with constant arguments is inlined, and those -/// constant arguments cause a significant amount of code in the callee to be -/// dead. Since this doesn't produce an exact copy of the input, it can't be -/// used for things like CloneFunction or CloneModule. -void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, +/// This works like CloneAndPruneFunctionInto, except that it does not clone the +/// entire function. Instead it starts at an instruction provided by the caller +/// and copies (and prunes) only the code reachable from that instruction. 
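A caller-side sketch of the new entry point documented above (hypothetical wrapper; as with full cloning, VMap must already map every value the cloned region references):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Transforms/Utils/Cloning.h"

    using namespace llvm;

    static void cloneTailExample(Function *NewF, const Function *OldF,
                                 const Instruction *Start,
                                 ValueToValueMapTy &VMap) {
      SmallVector<ReturnInst *, 4> Returns;
      // A null director falls back to plain pruning clone behavior.
      CloneAndPruneIntoFromInst(NewF, OldF, Start, VMap,
                                /*ModuleLevelChanges=*/false, Returns, ".clone",
                                /*CodeInfo=*/nullptr, /*Director=*/nullptr);
    }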
+void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
+ const Instruction *StartingInst,
 ValueToValueMapTy &VMap,
 bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst*> &Returns,
+ SmallVectorImpl<ReturnInst *> &Returns,
 const char *NameSuffix,
 ClonedCodeInfo *CodeInfo,
- const DataLayout *DL,
- Instruction *TheCall) {
+ CloningDirector *Director) {
 assert(NameSuffix && "NameSuffix cannot be null!");
-
+
+ ValueMapTypeRemapper *TypeMapper = nullptr;
+ ValueMaterializer *Materializer = nullptr;
+
+ if (Director) {
+ TypeMapper = Director->getTypeRemapper();
+ Materializer = Director->getValueMaterializer();
+ }
+
 #ifndef NDEBUG
- for (Function::const_arg_iterator II = OldFunc->arg_begin(),
- E = OldFunc->arg_end(); II != E; ++II)
- assert(VMap.count(II) && "No mapping from source argument specified!");
+ // If the cloning starts at the beginning of the function, verify that
+ // the function arguments are mapped.
+ if (!StartingInst)
+ for (Function::const_arg_iterator II = OldFunc->arg_begin(),
+ E = OldFunc->arg_end(); II != E; ++II)
+ assert(VMap.count(II) && "No mapping from source argument specified!");
 #endif
 PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
- NameSuffix, CodeInfo, DL);
+ NameSuffix, CodeInfo, Director);
+ const BasicBlock *StartingBB;
+ if (StartingInst)
+ StartingBB = StartingInst->getParent();
+ else {
+ StartingBB = &OldFunc->getEntryBlock();
+ StartingInst = StartingBB->begin();
+ }
 // Clone the entry block, and anything recursively reachable from it.
 std::vector<const BasicBlock*> CloneWorklist;
- CloneWorklist.push_back(&OldFunc->getEntryBlock());
+ PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist);
 while (!CloneWorklist.empty()) {
 const BasicBlock *BB = CloneWorklist.back();
 CloneWorklist.pop_back();
- PFC.CloneBlock(BB, CloneWorklist);
+ PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
 }
 // Loop over all of the basic blocks in the old function. If the block was
@@ -461,16 +527,24 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
 // Handle PHI nodes specially, as we have to remove references to dead
 // blocks.
- for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I)
- if (const PHINode *PN = dyn_cast<PHINode>(I))
- PHIToResolve.push_back(PN);
- else
+ for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) {
+ // PHI nodes may have been remapped to non-PHI nodes by the caller or
+ // during the cloning process.
+ if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (isa<PHINode>(VMap[PN]))
+ PHIToResolve.push_back(PN);
+ else
+ break;
+ } else {
 break;
+ }
+ }
 // Finally, remap the terminator instructions, as those can't be remapped
 // until all BBs are mapped.
 RemapInstruction(NewBB->getTerminator(), VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
 }
 // Defer PHI resolution until rest of function is resolved, PHI resolution
@@ -563,13 +637,13 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
 // node).
 for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
 if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]]))
- recursivelySimplifyInstruction(PN, DL);
+ recursivelySimplifyInstruction(PN);
 // Now that the inlined function body has been fully constructed, go through
- // and zap unconditional fall-through branches.
This happen all the time when + // and zap unconditional fall-through branches. This happens all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); + Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]); Function::iterator I = Begin; while (I != NewFunc->end()) { // Check if this block has become dead during inlining or other @@ -617,12 +691,32 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Do not increment I, iteratively merge all things this block branches to. } - // Make a final pass over the basic blocks from theh old function to gather + // Make a final pass over the basic blocks from the old function to gather // any return instructions which survived folding. We have to do this here // because we can iteratively remove and merge returns above. - for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]), + for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]), E = NewFunc->end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) Returns.push_back(RI); } + + +/// This works exactly like CloneFunctionInto, +/// except that it does some simple constant prop and DCE on the fly. The +/// effect of this is to copy significantly less code in cases where (for +/// example) a function call with constant arguments is inlined, and those +/// constant arguments cause a significant amount of code in the callee to be +/// dead. Since this doesn't produce an exact copy of the input, it can't be +/// used for things like CloneFunction or CloneModule. +void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl<ReturnInst*> &Returns, + const char *NameSuffix, + ClonedCodeInfo *CodeInfo, + Instruction *TheCall) { + CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), VMap, + ModuleLevelChanges, Returns, NameSuffix, CodeInfo, + nullptr); +} diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp index fae9ff5..2693322 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -69,9 +69,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { auto *PTy = cast<PointerType>(I->getType()); - auto *GA = - GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), - I->getLinkage(), I->getName(), New); + auto *GA = GlobalAlias::create(PTy, I->getLinkage(), I->getName(), New); GA->copyAttributesFrom(I); VMap[I] = GA; } diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index e70a7d6..ab89b41 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -332,11 +332,11 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, DEBUG(dbgs() << **i << ", "); DEBUG(dbgs() << ")\n"); + StructType *StructTy; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { - PointerType *StructPtr = - PointerType::getUnqual(StructType::get(M->getContext(), paramTy)); + StructTy = StructType::get(M->getContext(), paramTy); paramTy.clear(); - paramTy.push_back(StructPtr); + 
paramTy.push_back(PointerType::getUnqual(StructTy)); } FunctionType *funcType = FunctionType::get(RetTy, paramTy, false); @@ -364,8 +364,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext())); Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); TerminatorInst *TI = newFunction->begin()->getTerminator(); - GetElementPtrInst *GEP = - GetElementPtrInst::Create(AI, Idx, "gep_" + inputs[i]->getName(), TI); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + StructTy, AI, Idx, "gep_" + inputs[i]->getName(), TI); RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); } else RewriteVal = AI++; @@ -447,6 +447,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, } } + StructType *StructArgTy = nullptr; AllocaInst *Struct = nullptr; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { std::vector<Type*> ArgTypes; @@ -455,7 +456,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, ArgTypes.push_back((*v)->getType()); // Allocate a struct at the beginning of this function - Type *StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); + StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); Struct = new AllocaInst(StructArgTy, nullptr, "structArg", codeReplacer->getParent()->begin()->begin()); @@ -465,9 +466,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i); - GetElementPtrInst *GEP = - GetElementPtrInst::Create(Struct, Idx, - "gep_" + StructValues[i]->getName()); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName()); codeReplacer->getInstList().push_back(GEP); StoreInst *SI = new StoreInst(StructValues[i], GEP); codeReplacer->getInstList().push_back(SI); @@ -491,9 +491,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); - GetElementPtrInst *GEP - = GetElementPtrInst::Create(Struct, Idx, - "gep_reload_" + outputs[i]->getName()); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName()); codeReplacer->getInstList().push_back(GEP); Output = GEP; } else { @@ -606,10 +605,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut+out); - GetElementPtrInst *GEP = - GetElementPtrInst::Create(OAI, Idx, - "gep_" + outputs[out]->getName(), - NTRet); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + StructArgTy, OAI, Idx, "gep_" + outputs[out]->getName(), + NTRet); new StoreInst(outputs[out], GEP, NTRet); } else { new StoreInst(outputs[out], OAI, NTRet); diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp index 26875e8..dc95089 100644 --- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -11,14 +11,15 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/BitVector.h" #include "llvm/Transforms/Utils/CtorUtils.h" +#include 
"llvm/ADT/BitVector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "ctor_utils" diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 9972b22..003da58 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -39,6 +39,19 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, F->getEntryBlock().begin()); } + // We cannot demote invoke instructions to the stack if their normal edge + // is critical. Therefore, split the critical edge and create a basic block + // into which the store can be inserted. + if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { + if (!II->getNormalDest()->getSinglePredecessor()) { + unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest()); + assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!"); + BasicBlock *BB = SplitCriticalEdge(II, SuccNum); + assert(BB && "Unable to split critical edge."); + (void)BB; + } + } + // Change all of the users of the instruction to read from the stack slot. while (!I.use_empty()) { Instruction *U = cast<Instruction>(I.user_back()); @@ -71,7 +84,6 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, } } - // Insert stores of the computed value into the stack slot. We have to be // careful if I is an invoke instruction, because we can't insert the store // AFTER the terminator instruction. @@ -79,27 +91,13 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, if (!isa<TerminatorInst>(I)) { InsertPt = &I; ++InsertPt; + for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + /* empty */; // Don't insert before PHI nodes or landingpad instrs. } else { InvokeInst &II = cast<InvokeInst>(I); - if (II.getNormalDest()->getSinglePredecessor()) - InsertPt = II.getNormalDest()->getFirstInsertionPt(); - else { - // We cannot demote invoke instructions to the stack if their normal edge - // is critical. Therefore, split the critical edge and insert the store - // in the newly created basic block. - unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest()); - TerminatorInst *TI = &cast<TerminatorInst>(I); - assert (isCriticalEdge(TI, SuccNum) && - "Expected a critical edge!"); - BasicBlock *BB = SplitCriticalEdge(TI, SuccNum); - assert (BB && "Unable to split critical edge."); - InsertPt = BB->getFirstInsertionPt(); - } + InsertPt = II.getNormalDest()->getFirstInsertionPt(); } - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) - /* empty */; // Don't insert before PHI nodes or landingpad instrs. 
- new StoreInst(&I, Slot, InsertPt); return Slot; } diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 52e2d59..44b7d25 100644 --- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -150,7 +150,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, if (MSI->isVolatile()) return true; GS.StoredType = GlobalStatus::Stored; - } else if (ImmutableCallSite C = I) { + } else if (auto C = ImmutableCallSite(I)) { if (!C.isCallee(&U)) return true; GS.IsLoaded = true; diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index 2a86eb5..ddeaff0 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -88,7 +89,7 @@ namespace { CallerLPad = cast<LandingPadInst>(I); } - /// getOuterResumeDest - The outer unwind destination is the target of + /// The outer unwind destination is the target of /// unwind edges introduced for calls within the inlined function. BasicBlock *getOuterResumeDest() const { return OuterResumeDest; @@ -98,17 +99,16 @@ namespace { LandingPadInst *getLandingPadInst() const { return CallerLPad; } - /// forwardResume - Forward the 'resume' instruction to the caller's landing - /// pad block. When the landing pad block has only one predecessor, this is + /// Forward the 'resume' instruction to the caller's landing pad block. + /// When the landing pad block has only one predecessor, this is /// a simple branch. When there is more than one predecessor, we need to /// split the landing pad block after the landingpad instruction and jump /// to there. void forwardResume(ResumeInst *RI, SmallPtrSetImpl<LandingPadInst*> &InlinedLPads); - /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind - /// destination block for the given basic block, using the values for the - /// original invoke's source block. + /// Add incoming-PHI values to the unwind destination block for the given + /// basic block, using the values for the original invoke's source block. void addIncomingPHIValuesFor(BasicBlock *BB) const { addIncomingPHIValuesForInto(BB, OuterResumeDest); } @@ -123,7 +123,7 @@ namespace { }; } -/// getInnerResumeDest - Get or create a target for the branch from ResumeInsts. +/// Get or create a target for the branch from ResumeInsts. BasicBlock *InvokeInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; @@ -158,8 +158,8 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { return InnerResumeDest; } -/// forwardResume - Forward the 'resume' instruction to the caller's landing pad -/// block. When the landing pad block has only one predecessor, this is a simple +/// Forward the 'resume' instruction to the caller's landing pad block. +/// When the landing pad block has only one predecessor, this is a simple /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. 
void InvokeInliningInfo::forwardResume(ResumeInst *RI, @@ -177,9 +177,9 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI, RI->eraseFromParent(); } -/// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into -/// an invoke, we have to turn all of the calls that can throw into -/// invokes. This function analyze BB to see if there are any calls, and if so, +/// When we inline a basic block into an invoke, +/// we have to turn all of the calls that can throw into invokes. +/// This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, @@ -227,7 +227,7 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, } } -/// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls +/// If we inlined an invoke site, we need to convert calls /// in the body of the inlined function into invokes. /// /// II is the invoke instruction being inlined. FirstNewBlock is the first @@ -278,8 +278,8 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InvokeDest->removePredecessor(II->getParent()); } -/// CloneAliasScopeMetadata - When inlining a function that contains noalias -/// scope metadata, this metadata needs to be cloned so that the inlined blocks +/// When inlining a function that contains noalias scope metadata, +/// this metadata needs to be cloned so that the inlined blocks /// have different "unqiue scopes" at every call site. Were this not done, then /// aliasing scopes from a function inlined into a caller multiple times could /// not be differentiated (and this would lead to miscompiles because the @@ -319,13 +319,12 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // Now we have a complete set of all metadata in the chains used to specify // the noalias scopes and the lists of those scopes. - SmallVector<MDNode *, 16> DummyNodes; + SmallVector<TempMDTuple, 16> DummyNodes; DenseMap<const MDNode *, TrackingMDNodeRef> MDMap; for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end(); I != IE; ++I) { - MDNode *Dummy = MDNode::getTemporary(CalledFunc->getContext(), None); - DummyNodes.push_back(Dummy); - MDMap[*I].reset(Dummy); + DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None)); + MDMap[*I].reset(DummyNodes.back().get()); } // Create new metadata nodes to replace the dummy nodes, replacing old @@ -343,7 +342,8 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { } MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps); - MDNodeFwdDecl *TempM = cast<MDNodeFwdDecl>(MDMap[*I]); + MDTuple *TempM = cast<MDTuple>(MDMap[*I]); + assert(TempM->isTemporary() && "Expected temporary node"); TempM->replaceAllUsesWith(NewM); } @@ -388,18 +388,14 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { NI->setMetadata(LLVMContext::MD_noalias, M); } } - - // Now that everything has been replaced, delete the dummy nodes. 
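// [Editor's note] The manual cleanup pass deleted just below becomes
// unnecessary in the new code: TempMDTuple (used in the hunk above) is an
// owning, unique_ptr-style handle, so each temporary node is destroyed when
// DummyNodes goes out of scope. A sketch of the pattern, under that
// assumption:
//
//   SmallVector<TempMDTuple, 16> DummyNodes;
//   DummyNodes.push_back(MDTuple::getTemporary(Ctx, None)); // owned here
//   MDMap[Key].reset(DummyNodes.back().get());              // non-owning use
//   // ... replaceAllUsesWith() the temporaries with the real nodes ...
//   // no explicit MDNode::deleteTemporary() sweep is needed afterwards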
- for (unsigned i = 0, ie = DummyNodes.size(); i != ie; ++i) - MDNode::deleteTemporary(DummyNodes[i]); } -/// AddAliasScopeMetadata - If the inlined function has noalias arguments, then -/// add new alias scopes for each noalias argument, tag the mapped noalias +/// If the inlined function has noalias arguments, +/// then add new alias scopes for each noalias argument, tag the mapped noalias /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes. static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, - const DataLayout *DL, AliasAnalysis *AA) { + const DataLayout &DL, AliasAnalysis *AA) { if (!EnableNoAliasConversion) return; @@ -625,8 +621,9 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, /// If the inlined function has non-byval align arguments, then /// add @llvm.assume-based alignment assumptions to preserve this information. static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { - if (!PreserveAlignmentAssumptions || !IFI.DL) + if (!PreserveAlignmentAssumptions) return; + auto &DL = CS.getCaller()->getParent()->getDataLayout(); // To avoid inserting redundant assumptions, we should check for assumptions // already in the caller. To do this, we might need a DT of the caller. @@ -648,20 +645,20 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { // If we can already prove the asserted alignment in the context of the // caller, then don't bother inserting the assumption. Value *Arg = CS.getArgument(I->getArgNo()); - if (getKnownAlignment(Arg, IFI.DL, + if (getKnownAlignment(Arg, DL, CS.getInstruction(), &IFI.ACT->getAssumptionCache(*CalledFunc), - CS.getInstruction(), &DT) >= Align) + &DT) >= Align) continue; - IRBuilder<>(CS.getInstruction()).CreateAlignmentAssumption(*IFI.DL, Arg, - Align); + IRBuilder<>(CS.getInstruction()) + .CreateAlignmentAssumption(DL, Arg, Align); } } } -/// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee -/// into the caller, update the specified callgraph to reflect the changes we -/// made. Note that it's possible that not all code was copied over, so only +/// Once we have cloned code over from a callee into the caller, +/// update the specified callgraph to reflect the changes we made. +/// Note that it's possible that not all code was copied over, so only /// some edges of the callgraph may remain. static void UpdateCallGraphAfterInlining(CallSite CS, Function::iterator FirstNewBlock, @@ -696,8 +693,15 @@ static void UpdateCallGraphAfterInlining(CallSite CS, // If the call was inlined, but then constant folded, there is no edge to // add. Check for this case. Instruction *NewCall = dyn_cast<Instruction>(VMI->second); - if (!NewCall) continue; + if (!NewCall) + continue; + // We do not treat intrinsic calls like real function calls because we + // expect them to become inline code; do not add an edge for an intrinsic. + CallSite CS = CallSite(NewCall); + if (CS && CS.getCalledFunction() && CS.getCalledFunction()->isIntrinsic()) + continue; + // Remember that this call site got inlined for the client of // InlineFunction. 
IFI.InlinedCalls.push_back(NewCall); @@ -729,11 +733,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); IRBuilder<> Builder(InsertBlock->begin()); - Value *Size; - if (IFI.DL == nullptr) - Size = ConstantExpr::getSizeOf(AggTy); - else - Size = Builder.getInt64(IFI.DL->getTypeStoreSize(AggTy)); + Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy)); // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. Other optimizations can infer @@ -741,7 +741,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1); } -/// HandleByValArgument - When inlining a call site that has a byval argument, +/// When inlining a call site that has a byval argument, /// we have to make the implicit memcpy explicit by adding it. static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, const Function *CalledFunc, @@ -762,11 +762,13 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment. return Arg; + const DataLayout &DL = Caller->getParent()->getDataLayout(); + // If the pointer is already known to be sufficiently aligned, or if we can // round it up to a larger alignment, then we don't need a temporary. - if (getOrEnforceKnownAlignment(Arg, ByValAlignment, IFI.DL, - &IFI.ACT->getAssumptionCache(*Caller), - TheCall) >= ByValAlignment) + if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, + &IFI.ACT->getAssumptionCache(*Caller)) >= + ByValAlignment) return Arg; // Otherwise, we have to make a memcpy to get a safe alignment. This is bad @@ -774,10 +776,9 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, } // Create the alloca. If we have DataLayout, use nice alignment. - unsigned Align = 1; - if (IFI.DL) - Align = IFI.DL->getPrefTypeAlignment(AggTy); - + unsigned Align = + Caller->getParent()->getDataLayout().getPrefTypeAlignment(AggTy); + // If the byval had an alignment specified, we *must* use at least that // alignment, as it is required by the byval argument (and uses of the // pointer inside the callee). @@ -792,8 +793,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, return NewAlloca; } -// isUsedByLifetimeMarker - Check whether this Value is used by a lifetime -// intrinsic. +// Check whether this Value is used by a lifetime intrinsic. static bool isUsedByLifetimeMarker(Value *V) { for (User *U : V->users()) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { @@ -808,7 +808,7 @@ static bool isUsedByLifetimeMarker(Value *V) { return false; } -// hasLifetimeMarkers - Check whether the given alloca already has +// Check whether the given alloca already has // lifetime.start or lifetime.end intrinsics. static bool hasLifetimeMarkers(AllocaInst *AI) { Type *Ty = AI->getType(); @@ -827,35 +827,69 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { return false; } -/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to -/// recursively update InlinedAtEntry of a DebugLoc. 
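// [Editor's note] The replacement below rebuilds the inlined-at chain
// iteratively with a memo table (IANodes) instead of recursing. The same idea
// on a toy singly linked chain (purely illustrative, not LLVM API; ownership
// elided in this sketch):
//
//   struct Loc { int Line; Loc *InlinedAt; };
//   // Re-parent L's chain onto NewTail, reusing already-rebuilt suffixes.
//   Loc *reparent(Loc *L, Loc *NewTail, std::map<Loc *, Loc *> &Memo) {
//     std::vector<Loc *> Chain;        // links that still need rebuilding
//     Loc *Last = NewTail;
//     for (Loc *IA = L->InlinedAt; IA; IA = IA->InlinedAt) {
//       if (Memo.count(IA)) { Last = Memo[IA]; break; } // reuse cached link
//       Chain.push_back(IA);
//     }
//     for (auto I = Chain.rbegin(), E = Chain.rend(); I != E; ++I)
//       Last = Memo[*I] = new Loc{(*I)->Line, Last};    // rebuild top-down
//     return new Loc{L->Line, Last};
//   }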
-static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, - const DebugLoc &InlinedAtDL, - LLVMContext &Ctx) { - if (MDNode *IA = DL.getInlinedAt(Ctx)) { - DebugLoc NewInlinedAtDL - = updateInlinedAtInfo(DebugLoc::getFromDILocation(IA), InlinedAtDL, Ctx); - return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), - NewInlinedAtDL.getAsMDNode(Ctx)); +/// Rebuild the entire inlined-at chain for this instruction so that the top of +/// the chain now is inlined-at the new call site. +static DebugLoc +updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, + DenseMap<const DILocation *, DILocation *> &IANodes) { + SmallVector<DILocation *, 3> InlinedAtLocations; + DILocation *Last = InlinedAtNode; + DILocation *CurInlinedAt = DL; + + // Gather all the inlined-at nodes + while (DILocation *IA = CurInlinedAt->getInlinedAt()) { + // Skip any we've already built nodes for + if (DILocation *Found = IANodes[IA]) { + Last = Found; + break; + } + + InlinedAtLocations.push_back(IA); + CurInlinedAt = IA; } - return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), - InlinedAtDL.getAsMDNode(Ctx)); + // Starting from the top, rebuild the nodes to point to the new inlined-at + // location (then rebuilding the rest of the chain behind it) and update the + // map of already-constructed inlined-at nodes. + for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend(); + I != E; ++I) { + const DILocation *MD = *I; + Last = IANodes[MD] = DILocation::getDistinct( + Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); + } + + // And finally create the normal location for this instruction, referring to + // the new inlined-at chain. + return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); } -/// fixupLineNumbers - Update inlined instructions' line numbers to +/// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, Instruction *TheCall) { DebugLoc TheCallDL = TheCall->getDebugLoc(); - if (TheCallDL.isUnknown()) + if (!TheCallDL) return; + auto &Ctx = Fn->getContext(); + DILocation *InlinedAtNode = TheCallDL; + + // Create a unique call site, not to be confused with any other call from the + // same location. + InlinedAtNode = DILocation::getDistinct( + Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(), + InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt()); + + // Cache the inlined-at nodes as they're built so they are reused, without + // this every instruction's inlined-at chain would become distinct from each + // other. + DenseMap<const DILocation *, DILocation *> IANodes; + for (; FI != Fn->end(); ++FI) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { DebugLoc DL = BI->getDebugLoc(); - if (DL.isUnknown()) { + if (!DL) { // If the inlined instruction has no line number, make it look as if it // originates from the call location. 
This is important for // ((__always_inline__, __nodebug__)) functions which must use caller @@ -868,29 +902,15 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, BI->setDebugLoc(TheCallDL); } else { - BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext())); - if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) { - LLVMContext &Ctx = BI->getContext(); - MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx); - DVI->setOperand(2, MetadataAsValue::get( - Ctx, createInlinedVariable(DVI->getVariable(), - InlinedAt, Ctx))); - } else if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) { - LLVMContext &Ctx = BI->getContext(); - MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx); - DDI->setOperand(1, MetadataAsValue::get( - Ctx, createInlinedVariable(DDI->getVariable(), - InlinedAt, Ctx))); - } + BI->setDebugLoc(updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes)); } } } } -/// InlineFunction - This function inlines the called function into the basic -/// block of the caller. This returns false if it is not possible to inline -/// this call. The program is still in a well defined state if this occurs -/// though. +/// This function inlines the called function into the basic block of the +/// caller. This returns false if it is not possible to inline this call. +/// The program is still in a well defined state if this occurs though. /// /// Note that this only does one level of inlining. For example, if the /// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now @@ -975,6 +995,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Keep a list of pair (dst, src) to emit byval initializations. SmallVector<std::pair<Value*, Value*>, 4> ByValInit; + auto &DL = Caller->getParent()->getDataLayout(); + assert(CalledFunc->arg_size() == CS.arg_size() && "No varargs calls can be inlined!"); @@ -1009,9 +1031,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // have no dead or constant instructions leftover after inlining occurs // (which can happen, e.g., because an argument was constant), but we'll be // happy with whatever the cloner can do. - CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, + CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", - &InlinedFunctionInfo, IFI.DL, TheCall); + &InlinedFunctionInfo, TheCall); // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; @@ -1032,7 +1054,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CloneAliasScopeMetadata(CS, VMap); // Add noalias metadata if necessary. - AddAliasScopeMetadata(CS, VMap, IFI.DL, IFI.AA); + AddAliasScopeMetadata(CS, VMap, DL, IFI.AA); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -1079,6 +1101,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, FirstNewBlock->getInstList(), AI, I); } + // Move any dbg.declares describing the allocas into the entry basic block. 
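// [Editor's note] The loop that follows relies on the new bool parameter of
// replaceDbgDeclareForAlloca (see the Local.cpp hunk further down). Here the
// alloca itself is still the variable's home, so Deref stays false; a pass
// that hands out the variable's address through another pointer would ask for
// the extra indirection instead, e.g. (illustrative only):
//
//   // NewAddr holds a pointer to the variable's storage, so a DW_OP_deref
//   // is prepended to the debug expression:
//   replaceDbgDeclareForAlloca(AI, NewAddr, DIB, /*Deref=*/true);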
+ DIBuilder DIB(*Caller->getParent()); + for (auto &AI : IFI.StaticAllocas) + replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); } bool InlinedMustTailCalls = false; @@ -1136,18 +1162,21 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, ConstantInt *AllocaSize = nullptr; if (ConstantInt *AIArraySize = dyn_cast<ConstantInt>(AI->getArraySize())) { - if (IFI.DL) { - Type *AllocaType = AI->getAllocatedType(); - uint64_t AllocaTypeSize = IFI.DL->getTypeAllocSize(AllocaType); - uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); - assert(AllocaArraySize > 0 && "array size of AllocaInst is zero"); - // Check that array size doesn't saturate uint64_t and doesn't - // overflow when it's multiplied by type size. - if (AllocaArraySize != ~0ULL && - UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { - AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), - AllocaArraySize * AllocaTypeSize); - } + auto &DL = Caller->getParent()->getDataLayout(); + Type *AllocaType = AI->getAllocatedType(); + uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); + uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); + + // Don't add markers for zero-sized allocas. + if (AllocaArraySize == 0) + continue; + + // Check that array size doesn't saturate uint64_t and doesn't + // overflow when it's multiplied by type size. + if (AllocaArraySize != ~0ULL && + UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { + AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), + AllocaArraySize * AllocaTypeSize); } } @@ -1173,7 +1202,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Insert the llvm.stacksave. CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin()) - .CreateCall(StackSave, "savedstack"); + .CreateCall(StackSave, {}, "savedstack"); // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. @@ -1408,7 +1437,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the entries are the same or undef). If so, remove the PHI so it doesn't // block other optimizations. 
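// [Editor's note] The hunk below shows the recurring migration in this
// commit: the nullable DataLayout pointer is gone, and callers fetch the now
// mandatory layout from the Module. The pattern, in isolation:
//
//   const DataLayout &DL = Caller->getParent()->getDataLayout();
//   if (Value *V = SimplifyInstruction(PHI, DL, /*TLI=*/nullptr,
//                                      /*DT=*/nullptr, AC)) {
//     PHI->replaceAllUsesWith(V);
//     PHI->eraseFromParent();
//   }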
if (PHI) { - if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr, + auto &DL = Caller->getParent()->getDataLayout(); + if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr, &IFI.ACT->getAssumptionCache(*Caller))) { PHI->replaceAllUsesWith(V); PHI->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 0ae746c..30edf3b 100644 --- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -252,8 +252,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero); Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero); Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2); - Value *Tmp0 = Builder.CreateCall2(CTLZ, Divisor, True); - Value *Tmp1 = Builder.CreateCall2(CTLZ, Dividend, True); + Value *Tmp0 = Builder.CreateCall(CTLZ, {Divisor, True}); + Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True}); Value *SR = Builder.CreateSub(Tmp0, Tmp1); Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB); Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4); diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 3f9b702..9d40b69 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -112,17 +112,17 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, if (SSAUpdate.HasValueForBlock(ExitBB)) continue; - PHINode *PN = PHINode::Create(Inst.getType(), PredCache.GetNumPreds(ExitBB), + PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB), Inst.getName() + ".lcssa", ExitBB->begin()); // Add inputs from inside the loop for this PHI. - for (BasicBlock **PI = PredCache.GetPreds(ExitBB); *PI; ++PI) { - PN->addIncoming(&Inst, *PI); + for (BasicBlock *Pred : PredCache.get(ExitBB)) { + PN->addIncoming(&Inst, Pred); // If the exit block has a predecessor not within the loop, arrange for // the incoming value use corresponding to that predecessor to be // rewritten in terms of a different LCSSA PHI. - if (!L.contains(*PI)) + if (!L.contains(Pred)) UsesToRewrite.push_back( &PN->getOperandUse(PN->getOperandNumForIncomingValue( PN->getNumIncomingValues() - 1))); @@ -294,21 +294,18 @@ struct LCSSA : public FunctionPass { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); } - -private: - void verifyAnalysis() const override; }; } char LCSSA::ID = 0; INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) Pass *llvm::createLCSSAPass() { return new LCSSA(); } @@ -318,7 +315,7 @@ char &llvm::LCSSAID = LCSSA::ID; /// Process all loops in the function, inner-most out. 
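// [Editor's note] LoopInfo stops being a pass of its own in this revision;
// consumers go through LoopInfoWrapperPass instead, as the hunks below show.
// A minimal consumer sketch (processLoops is a hypothetical helper):
//
//   void getAnalysisUsage(AnalysisUsage &AU) const override {
//     AU.addRequired<LoopInfoWrapperPass>();
//     AU.addPreserved<LoopInfoWrapperPass>();
//   }
//   bool runOnFunction(Function &F) override {
//     LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
//     return processLoops(LI);
//   }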
bool LCSSA::runOnFunction(Function &F) { bool Changed = false; - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = getAnalysisIfAvailable<ScalarEvolution>(); @@ -329,18 +326,3 @@ bool LCSSA::runOnFunction(Function &F) { return Changed; } -static void verifyLoop(Loop &L, DominatorTree &DT) { - // Recurse depth-first through inner loops. - for (Loop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI) - verifyLoop(**LI, DT); - - // Check the special guarantees that LCSSA makes. - //assert(L.isLCSSAForm(DT) && "LCSSA form not preserved!"); -} - -void LCSSA::verifyAnalysis() const { - // Verify each loop nest in the function, assuming LI still points at that - // function's loop info. - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - verifyLoop(**I, *DT); -} diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index 2a84d7e..70c77b0 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -110,11 +111,17 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, } if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) { - // If we are switching on a constant, we can convert the switch into a - // single branch instruction! + // If we are switching on a constant, we can convert the switch to an + // unconditional branch. ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition()); - BasicBlock *TheOnlyDest = SI->getDefaultDest(); - BasicBlock *DefaultDest = TheOnlyDest; + BasicBlock *DefaultDest = SI->getDefaultDest(); + BasicBlock *TheOnlyDest = DefaultDest; + + // If the default is unreachable, ignore it when searching for TheOnlyDest. + if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) && + SI->getNumCases() > 0) { + TheOnlyDest = SI->case_begin().getCaseSuccessor(); + } // Figure out which case it goes to. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); @@ -410,7 +417,7 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, /// /// This returns true if it changed the code, note that it can delete /// instructions in other blocks as well in this block. -bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD, +bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool MadeChange = false; @@ -427,7 +434,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD, Instruction *Inst = BI++; WeakVH BIHandle(BI); - if (recursivelySimplifyInstruction(Inst, TD, TLI)) { + if (recursivelySimplifyInstruction(Inst, TLI)) { MadeChange = true; if (BIHandle != BI) BI = BB->begin(); @@ -457,8 +464,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD, /// /// .. and delete the predecessor corresponding to the '1', this will attempt to /// recursively fold the and to 0. -void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, - DataLayout *TD) { +void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) { // This only adjusts blocks with PHI nodes. 
if (!isa<PHINode>(BB->begin())) return; @@ -473,7 +479,7 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt)); Value *OldPhiIt = PhiIt; - if (!recursivelySimplifyInstruction(PN, TD)) + if (!recursivelySimplifyInstruction(PN)) continue; // If recursive simplification ended up deleting the next PHI node we would @@ -489,7 +495,7 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, /// between them, moving the instructions in the predecessor into DestBB and /// deleting the predecessor block. /// -void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { +void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) { // If BB has single-entry PHI nodes, fold them. while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) { Value *NewVal = PN->getIncomingValue(0); @@ -525,14 +531,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) { if (PredBB == &DestBB->getParent()->getEntryBlock()) DestBB->moveAfter(PredBB); - if (P) { - if (DominatorTreeWrapperPass *DTWP = - P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DominatorTree &DT = DTWP->getDomTree(); - BasicBlock *PredBBIDom = DT.getNode(PredBB)->getIDom()->getBlock(); - DT.changeImmediateDominator(DestBB, PredBBIDom); - DT.eraseNode(PredBB); - } + if (DT) { + BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock(); + DT->changeImmediateDominator(DestBB, PredBBIDom); + DT->eraseNode(PredBB); } // Nuke BB. PredBB->eraseFromParent(); @@ -897,13 +899,14 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// their preferred alignment from the beginning. /// static unsigned enforceKnownAlignment(Value *V, unsigned Align, - unsigned PrefAlign, const DataLayout *TD) { + unsigned PrefAlign, + const DataLayout &DL) { V = V->stripPointerCasts(); if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { // If the preferred alignment is greater than the natural stack alignment // then don't round up. This avoids dynamic stack realignment. - if (TD && TD->exceedsNaturalStackAlignment(PrefAlign)) + if (DL.exceedsNaturalStackAlignment(PrefAlign)) return Align; // If there is a requested alignment and if this is an alloca, round up. if (AI->getAlignment() >= PrefAlign) @@ -942,13 +945,13 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, /// and it is more than the alignment of the ultimate object, see if we can /// increase the alignment of the ultimate object, making this check succeed. unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, - const DataLayout *DL, - AssumptionCache *AC, + const DataLayout &DL, const Instruction *CxtI, + AssumptionCache *AC, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); - unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64; + unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType()); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT); @@ -975,7 +978,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, /// /// See if there is a dbg.value intrinsic for DIVar before I. 
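// [Editor's note] In the debug-intrinsic hunks below,
// DIBuilder::insertDbgValueIntrinsic gains a DebugLoc parameter, so the old
// create-then-setDebugLoc sequence collapses into one call:
//
//   // before:
//   Instruction *DbgVal =
//       Builder.insertDbgValueIntrinsic(V, 0, DIVar, DIExpr, SI);
//   DbgVal->setDebugLoc(DDI->getDebugLoc());
//   // after (as in this diff):
//   Builder.insertDbgValueIntrinsic(V, 0, DIVar, DIExpr,
//                                   DDI->getDebugLoc(), SI);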
-static bool LdStHasDebugValue(DIVariable &DIVar, Instruction *I) { +static bool LdStHasDebugValue(const DILocalVariable *DIVar, Instruction *I) { // Since we can't guarantee that the original dbg.declare instrinsic // is removed by LowerDbgDeclare(), we need to make sure that we are // not inserting the same dbg.value intrinsic over and over. @@ -995,17 +998,13 @@ static bool LdStHasDebugValue(DIVariable &DIVar, Instruction *I) { /// that has an associated llvm.dbg.decl intrinsic. bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder) { - DIVariable DIVar(DDI->getVariable()); - DIExpression DIExpr(DDI->getExpression()); - assert((!DIVar || DIVar.isVariable()) && - "Variable in DbgDeclareInst should be either null or a DIVariable."); - if (!DIVar) - return false; + auto *DIVar = DDI->getVariable(); + auto *DIExpr = DDI->getExpression(); + assert(DIVar && "Missing variable"); if (LdStHasDebugValue(DIVar, SI)) return true; - Instruction *DbgVal = nullptr; // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. Argument *ExtendedArg = nullptr; @@ -1014,11 +1013,11 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0)); if (ExtendedArg) - DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, DIExpr, SI); + Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, DIExpr, + DDI->getDebugLoc(), SI); else - DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, - DIExpr, SI); - DbgVal->setDebugLoc(DDI->getDebugLoc()); + Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, + DDI->getDebugLoc(), SI); return true; } @@ -1026,19 +1025,15 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, /// that has an associated llvm.dbg.decl intrinsic. bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, LoadInst *LI, DIBuilder &Builder) { - DIVariable DIVar(DDI->getVariable()); - DIExpression DIExpr(DDI->getExpression()); - assert((!DIVar || DIVar.isVariable()) && - "Variable in DbgDeclareInst should be either null or a DIVariable."); - if (!DIVar) - return false; + auto *DIVar = DDI->getVariable(); + auto *DIExpr = DDI->getExpression(); + assert(DIVar && "Missing variable"); if (LdStHasDebugValue(DIVar, LI)) return true; - Instruction *DbgVal = - Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, LI); - DbgVal->setDebugLoc(DDI->getDebugLoc()); + Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, + DDI->getDebugLoc(), LI); return true; } @@ -1080,10 +1075,9 @@ bool llvm::LowerDbgDeclare(Function &F) { // This is a call by-value or some other instruction that // takes a pointer to the variable. Insert a *value* // intrinsic that describes the alloca. 
- auto DbgVal = DIB.insertDbgValueIntrinsic( - AI, 0, DIVariable(DDI->getVariable()), - DIExpression(DDI->getExpression()), CI); - DbgVal->setDebugLoc(DDI->getDebugLoc()); + DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(), + DDI->getExpression(), DDI->getDebugLoc(), + CI); } DDI->eraseFromParent(); } @@ -1104,32 +1098,31 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { } bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder) { + DIBuilder &Builder, bool Deref) { DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); if (!DDI) return false; - DIVariable DIVar(DDI->getVariable()); - DIExpression DIExpr(DDI->getExpression()); - assert((!DIVar || DIVar.isVariable()) && - "Variable in DbgDeclareInst should be either null or a DIVariable."); - if (!DIVar) - return false; - - // Create a copy of the original DIDescriptor for user variable, prepending - // "deref" operation to a list of address elements, as new llvm.dbg.declare - // will take a value storing address of the memory for variable, not - // alloca itself. - SmallVector<int64_t, 4> NewDIExpr; - NewDIExpr.push_back(dwarf::DW_OP_deref); - if (DIExpr) - for (unsigned i = 0, n = DIExpr.getNumElements(); i < n; ++i) - NewDIExpr.push_back(DIExpr.getElement(i)); + DebugLoc Loc = DDI->getDebugLoc(); + auto *DIVar = DDI->getVariable(); + auto *DIExpr = DDI->getExpression(); + assert(DIVar && "Missing variable"); + + if (Deref) { + // Create a copy of the original DIDescriptor for user variable, prepending + // "deref" operation to a list of address elements, as new llvm.dbg.declare + // will take a value storing address of the memory for variable, not + // alloca itself. + SmallVector<uint64_t, 4> NewDIExpr; + NewDIExpr.push_back(dwarf::DW_OP_deref); + if (DIExpr) + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); + DIExpr = Builder.createExpression(NewDIExpr); + } // Insert llvm.dbg.declare in the same basic block as the original alloca, // and remove old llvm.dbg.declare. BasicBlock *BB = AI->getParent(); - Builder.insertDeclare(NewAllocaAddress, DIVar, - Builder.createExpression(NewDIExpr), BB); + Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, BB); DDI->eraseFromParent(); return true; } @@ -1254,7 +1247,7 @@ static bool markAliveBlocks(BasicBlock *BB, if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { changeToUnreachable(II, true); Changed = true; - } else if (II->doesNotThrow()) { + } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(II)) { if (II->use_empty() && II->onlyReadsMemory()) { // jump to the normal destination branch. 
BranchInst::Create(II->getNormalDest(), II); @@ -1350,3 +1343,23 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign } } } + +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const BasicBlockEdge &Root) { + assert(From->getType() == To->getType()); + + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ) { + Use &U = *UI++; + if (DT.dominates(Root, U)) { + U.set(To); + DEBUG(dbgs() << "Replace dominated use of '" + << From->getName() << "' as " + << *To << " in " << *U << "\n"); + ++Count; + } + } + return Count; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index c832a4b..90dfaba 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -57,8 +57,10 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -113,6 +115,14 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { BasicBlock *Header = L->getHeader(); + // Get analyses that we try to update. + auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>(); + auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); + // Compute the set of predecessors of the loop that are not in the loop. SmallVector<BasicBlock*, 8> OutsideBlocks; for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -131,15 +141,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // Split out the loop pre-header. BasicBlock *PreheaderBB; - if (!Header->isLandingPad()) { - PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - PP); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", - ".split-lp", PP, NewBBs); - PreheaderBB = NewBBs[0]; - } + PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", + AA, DT, LI, PreserveLCSSA); PreheaderBB->getTerminator()->setDebugLoc( Header->getFirstNonPHI()->getDebugLoc()); @@ -157,7 +160,9 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { /// /// This method is used to split exit blocks that have predecessors outside of /// the loop. 
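// [Editor's note] Here and in the LoopSimplify hunks below, the landing-pad
// special case disappears: SplitBlockPredecessors now handles landing-pad
// blocks itself and takes the analyses to update explicitly. The new call
// shape, in isolation:
//
//   BasicBlock *NewBB =
//       SplitBlockPredecessors(BB, Preds, ".split", AA, DT, LI,
//                              /*PreserveLCSSA=*/true);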
-static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) { +static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, + AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, Pass *PP) { SmallVector<BasicBlock*, 8> LoopBlocks; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; @@ -172,15 +177,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) { assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); BasicBlock *NewExitBB = nullptr; - if (Exit->isLandingPad()) { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Exit, LoopBlocks, - ".loopexit", ".nonloopexit", - PP, NewBBs); - NewExitBB = NewBBs[0]; - } else { - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", PP); - } + bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); + + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT, + LI, PreserveLCSSA); DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " << NewExitBB->getName() << "\n"); @@ -211,10 +211,11 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, DominatorTree *DT, AssumptionCache *AC) { + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I); ++I; - if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) { + if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); if (AA) AA->deleteValue(PN); @@ -287,9 +288,11 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (SE) SE->forgetLoop(L); + bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); + BasicBlock *Header = L->getHeader(); - BasicBlock *NewBB = - SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", PP); + BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", + AA, DT, LI, PreserveLCSSA); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. @@ -460,7 +463,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // Update Loop Information - we know that this block is now in the current // loop and all parent loops. - L->addBasicBlockToLoop(BEBlock, LI->getBase()); + L->addBasicBlockToLoop(BEBlock, *LI); // Update dominator information DT->splitBlock(BEBlock); @@ -476,7 +479,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, /// explicit if they accepted the analysis directly and then updated it. static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, Pass *PP, const DataLayout *DL, + ScalarEvolution *SE, Pass *PP, AssumptionCache *AC) { bool Changed = false; ReprocessLoop: @@ -567,7 +570,7 @@ ReprocessLoop: // Must be exactly this loop: no subloops, parent loops, or non-loop preds // allowed. if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, PP)) { + if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) { ++NumInserted; Changed = true; } @@ -608,13 +611,15 @@ ReprocessLoop: } } + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + // Scan over the PHI nodes in the loop header. 
Since they now have only two // incoming values (the loop is canonicalized), we may have simplified the PHI // down to 'X = phi [X, Y]', which should be replaced with 'Y'. PHINode *PN; for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) - if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) { + if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); @@ -676,7 +681,8 @@ ReprocessLoop: // The block has now been cleared of all instructions except for // a comparison and a conditional branch. SimplifyCFG may be able // to fold it now. - if (!FoldBranchToCommonDest(BI, DL)) continue; + if (!FoldBranchToCommonDest(BI)) + continue; // Success. The block is now dead, so remove it from the loop, // update the dominator tree and delete it. @@ -714,7 +720,7 @@ ReprocessLoop: bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, AliasAnalysis *AA, ScalarEvolution *SE, - const DataLayout *DL, AssumptionCache *AC) { + AssumptionCache *AC) { bool Changed = false; // Worklist maintains our depth-first queue of loops in this nest to process. @@ -726,13 +732,12 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, // order. We can use this simple process because loops form a tree. for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { Loop *L2 = Worklist[Idx]; - for (Loop::iterator I = L2->begin(), E = L2->end(); I != E; ++I) - Worklist.push_back(*I); + Worklist.append(L2->begin(), L2->end()); } while (!Worklist.empty()) Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, - SE, PP, DL, AC); + SE, PP, AC); return Changed; } @@ -750,7 +755,6 @@ namespace { DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; AssumptionCache *AC; bool runOnFunction(Function &F) override; @@ -762,8 +766,8 @@ namespace { AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); @@ -781,7 +785,7 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) @@ -795,16 +799,14 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; AA = getAnalysisIfAvailable<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = getAnalysisIfAvailable<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); // Simplify each loop nest in the function. 
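// [Editor's note] simplifyLoop loses its DataLayout parameter in this
// revision; each callee now derives the layout from the Module on demand via
// L->getHeader()->getModule()->getDataLayout(). The driver loop below
// therefore shrinks to:
//
//   for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
//     Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC);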
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AC); + Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC); return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 5745920..1dbce47 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -26,8 +26,8 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -146,6 +146,13 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, /// Similarly, TripMultiple divides the number of times that the LatchBlock may /// execute without exiting the loop. /// +/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that +/// have a runtime (i.e. not compile time constant) trip count. Unrolling these +/// loops require a unroll "prologue" that runs "RuntimeTripCount % Count" +/// iterations before branching into the unrolled loop. UnrollLoop will not +/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and +/// AllowExpensiveTripCount is false. +/// /// The LoopInfo Analysis that is passed will be kept consistent. /// /// If a LoopPassManager is passed in, and the loop is fully removed, it will be @@ -154,8 +161,9 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are /// available from the Pass it must also preserve those analyses. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, - bool AllowRuntime, unsigned TripMultiple, LoopInfo *LI, - Pass *PP, LPPassManager *LPM, AssumptionCache *AC) { + bool AllowRuntime, bool AllowExpensiveTripCount, + unsigned TripMultiple, LoopInfo *LI, Pass *PP, + LPPassManager *LPM, AssumptionCache *AC) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -218,7 +226,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // flag is specified. bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); - if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM)) + if (RuntimeTripCount && + !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, LPM)) return false; // Notify ScalarEvolution that the loop will be substantially changed, @@ -311,7 +320,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Tell LI about New. if (*BB == Header) { assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop"); - L->addBasicBlockToLoop(New, LI->getBase()); + L->addBasicBlockToLoop(New, *LI); } else { // Figure out which loop New is in. const Loop *OldLoop = LI->getLoopFor(*BB); @@ -333,7 +342,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (SE) SE->forgetLoop(OldLoop); } - NewLoop->addBasicBlockToLoop(New, LI->getBase()); + NewLoop->addBasicBlockToLoop(New, *LI); } if (*BB == Header) @@ -500,6 +509,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // At this point, the code is well formed. 
We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. + const DataLayout &DL = Header->getModule()->getDataLayout(); const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks(); for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), BBE = NewLoopBlocks.end(); BB != BBE; ++BB) @@ -508,7 +518,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (isInstructionTriviallyDead(Inst)) (*BB)->getInstList().erase(Inst); - else if (Value *V = SimplifyInstruction(Inst)) + else if (Value *V = SimplifyInstruction(Inst, DL)) if (LI->replacementPreservesLCSSAForm(Inst, V)) { Inst->replaceAllUsesWith(V); (*BB)->getInstList().erase(Inst); @@ -531,9 +541,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { - DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; - simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AC); + simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, AC); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after @@ -549,3 +557,26 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, return true; } + +/// Given an llvm.loop loop id metadata node, returns the loop hint metadata +/// node with the given name (for example, "llvm.loop.unroll.count"). If no +/// such metadata node exists, then nullptr is returned. +MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) + continue; + + MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + + if (Name.equals(S->getString())) + return MD; + } + return nullptr; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 8a32215..d1774df 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -23,14 +23,18 @@ #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include <algorithm> @@ -58,7 +62,8 @@ STATISTIC(NumRuntimeUnrolled, static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, Pass *P) { + ValueToValueMapTy &VMap, AliasAnalysis *AA, + DominatorTree *DT, LoopInfo *LI, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -122,13 
+127,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); - if (!Exit->isLandingPad()) { - SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P); - } else { - SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa", - P, NewBBs); - } + SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI, + P->mustPreserveAnalysisID(LCSSAID)); // Add the branch to the exit block (around the unrolled loop) BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt); InsertPt->eraseFromParent(); @@ -167,9 +167,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, NewBlocks.push_back(NewBB); if (NewLoop) - NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + NewLoop->addBasicBlockToLoop(NewBB, *LI); else if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + ParentLoop->addBasicBlockToLoop(NewBB, *LI); VMap[*BB] = NewBB; if (Header == *BB) { @@ -278,7 +278,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, /// ... /// End: /// -bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, +bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, + bool AllowExpensiveTripCount, LoopInfo *LI, LPPassManager *LPM) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) @@ -312,15 +313,20 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; + BasicBlock *Header = L->getHeader(); + const DataLayout &DL = Header->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "loop-unroll"); + if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L)) + return false; + // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. if (!isPowerOf2_32(Count)) return false; // This constraint lets us deal with an overflowing trip count easily; see the - // comment on ModVal below. This check is equivalent to `Log2(Count) < - // BEWidth`. - if (static_cast<uint64_t>(Count) > (1ULL << BEWidth)) + // comment on ModVal below. + if (Log2_32(Count) > BEWidth) return false; // If this loop is nested, then the loop unroller changes the code in @@ -328,18 +334,20 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); + // Grab analyses that we preserve. + auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + BasicBlock *PH = L->getLoopPreheader(); - BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the // prolog code and one for a new loop preheader - BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); - BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); + BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); + BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); // Compute the number of extra iterations required, which is: // extra iterations = run-time trip count % (loop unroll factor + 1) - SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), @@ -408,7 +416,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // PHI functions. BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, - LPM->getAsPass()); + /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); NumRuntimeUnrolled++; return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp new file mode 100644 index 0000000..a5890c0 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -0,0 +1,499 @@ +//===-- LoopUtils.cpp - Loop Utility functions -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines common loop utility functions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Support/Debug.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "loop-utils" + +bool ReductionDescriptor::areAllUsesIn(Instruction *I, + SmallPtrSetImpl<Instruction *> &Set) { + for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) + if (!Set.count(dyn_cast<Instruction>(*Use))) + return false; + return true; +} + +bool ReductionDescriptor::AddReductionVar(PHINode *Phi, ReductionKind Kind, + Loop *TheLoop, bool HasFunNoNaNAttr, + ReductionDescriptor &RedDes) { + if (Phi->getNumIncomingValues() != 2) + return false; + + // Reduction variables are only found in the loop header block. + if (Phi->getParent() != TheLoop->getHeader()) + return false; + + // Obtain the reduction start value from the value that comes from the loop + // preheader. + Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); + + // ExitInstruction is the single value which is used outside the loop. + // We only allow for a single reduction value to be used outside the loop. + // This includes users of the reduction, variables (which form a cycle + // which ends in the phi node). 
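// For intuition, a minimal source-level sketch of the reduction cycle being
// described here (a hypothetical example, not code from this file): the
// phi/add pair forms the cycle, and Sum's single use outside the loop is
// what becomes ExitInstruction below.
static int sumReduction(const int *A, int N) {
  int Sum = 0;                 // RdxStart: value incoming from the preheader
  for (int I = 0; I < N; ++I)
    Sum += A[I];               // RK_IntegerAdd: phi -> add -> phi cycle
  return Sum;                  // single out-of-loop user of the add
}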
+ Instruction *ExitInstruction = nullptr; + // Indicates that we found a reduction operation in our scan. + bool FoundReduxOp = false; + + // We start with the PHI node and scan for all of the users of this + // instruction. All users must be instructions that can be used as reduction + // variables (such as ADD). We must have a single out-of-block user. The cycle + // must include the original PHI. + bool FoundStartPHI = false; + + // To recognize min/max patterns formed by an icmp select sequence, we store + // the number of instructions we saw from the recognized min/max pattern, + // to make sure we only see exactly the two instructions. + unsigned NumCmpSelectPatternInst = 0; + ReductionInstDesc ReduxDesc(false, nullptr); + + SmallPtrSet<Instruction *, 8> VisitedInsts; + SmallVector<Instruction *, 8> Worklist; + Worklist.push_back(Phi); + VisitedInsts.insert(Phi); + + // A value in the reduction can be used: + // - By the reduction: + // - Reduction operation: + // - One use of reduction value (safe). + // - Multiple use of reduction value (not safe). + // - PHI: + // - All uses of the PHI must be the reduction (safe). + // - Otherwise, not safe. + // - By one instruction outside of the loop (safe). + // - By further instructions outside of the loop (not safe). + // - By an instruction that is not part of the reduction (not safe). + // This is either: + // * An instruction type other than PHI or the reduction operation. + // * A PHI in the header other than the initial PHI. + while (!Worklist.empty()) { + Instruction *Cur = Worklist.back(); + Worklist.pop_back(); + + // No Users. + // If the instruction has no users then this is a broken chain and can't be + // a reduction variable. + if (Cur->use_empty()) + return false; + + bool IsAPhi = isa<PHINode>(Cur); + + // A header PHI use other than the original PHI. + if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) + return false; + + // Reductions of instructions such as Div and Sub are only possible if the + // LHS is the reduction variable. + if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && + !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && + !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) + return false; + + // Any reduction instruction must be of one of the allowed kinds. + ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); + if (!ReduxDesc.isReduction()) + return false; + + // A reduction operation must only have one use of the reduction value. + if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && + hasMultipleUsesOf(Cur, VisitedInsts)) + return false; + + // All inputs to a PHI node must be a reduction value. + if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) + return false; + + if (Kind == RK_IntegerMinMax && + (isa<ICmpInst>(Cur) || isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + + // Check whether we found a reduction operator. + FoundReduxOp |= !IsAPhi; + + // Process users of current instruction. Push non-PHI nodes after PHI nodes + // onto the stack. This way we are going to have seen all inputs to PHI + // nodes once we get to them. + SmallVector<Instruction *, 8> NonPHIs; + SmallVector<Instruction *, 8> PHIs; + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); + + // Check if we found the exit user.
+ BasicBlock *Parent = UI->getParent(); + if (!TheLoop->contains(Parent)) { + // Exit if you find multiple outside users or if the header phi node is + // being used. In this case the user uses the value of the previous + // iteration, in which case we would lose "VF-1" iterations of the + // reduction operation if we vectorize. + if (ExitInstruction != nullptr || Cur == Phi) + return false; + + // The instruction used by an outside user must be the last instruction + // before we feed back to the reduction phi. Otherwise, we lose VF-1 + // operations on the value. + if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) + return false; + + ExitInstruction = Cur; + continue; + } + + // Process instructions only once (termination). Each reduction cycle + // value must only be used once, except by phi nodes and min/max + // reductions which are represented as a cmp followed by a select. + ReductionInstDesc IgnoredVal(false, nullptr); + if (VisitedInsts.insert(UI).second) { + if (isa<PHINode>(UI)) + PHIs.push_back(UI); + else + NonPHIs.push_back(UI); + } else if (!isa<PHINode>(UI) && + ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) && + !isa<SelectInst>(UI)) || + !isMinMaxSelectCmpPattern(UI, IgnoredVal).isReduction())) + return false; + + // Remember that we completed the cycle. + if (UI == Phi) + FoundStartPHI = true; + } + Worklist.append(PHIs.begin(), PHIs.end()); + Worklist.append(NonPHIs.begin(), NonPHIs.end()); + } + + // This means we have seen one but not the other instruction of the + // pattern or more than just a select and cmp. + if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && + NumCmpSelectPatternInst != 2) + return false; + + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) + return false; + + // We found a reduction var if we have reached the original phi node and we + // only have a single instruction with out-of-loop users. + + // The ExitInstruction (the instruction which is allowed to have out-of-loop users) + // is saved as part of the ReductionDescriptor. + + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.getMinMaxKind()); + + RedDes = RD; + + return true; +} + +/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction +/// pattern corresponding to a min(X, Y) or max(X, Y). +ReductionInstDesc +ReductionDescriptor::isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev) { + + assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && + "Expect a select instruction"); + Instruction *Cmp = nullptr; + SelectInst *Select = nullptr; + + // We must handle the select(cmp()) as a single instruction. Advance to the + // select. + if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { + if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) + return ReductionInstDesc(false, I); + return ReductionInstDesc(Select, Prev.getMinMaxKind()); + } + + // Only handle single use cases for now. + if (!(Select = dyn_cast<SelectInst>(I))) + return ReductionInstDesc(false, I); + if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && + !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) + return ReductionInstDesc(false, I); + if (!Cmp->hasOneUse()) + return ReductionInstDesc(false, I); + + Value *CmpLeft; + Value *CmpRight; + + // Look for a min/max pattern.
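// A minimal sketch of the PatternMatch helpers used just below; isSignedMin
// is a hypothetical name, but the m_SMin matcher and its use mirror this file:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
static bool isSignedMin(Value *V) {
  Value *L, *R;
  // Matches select(icmp slt L, R), L, R, i.e. smin(L, R), including the
  // commuted form of the compare and select operands.
  return m_SMin(m_Value(L), m_Value(R)).match(V);
}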
+ if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_UIntMin); + else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_UIntMax); + else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_SIntMax); + else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_SIntMin); + else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_FloatMin); + else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_FloatMax); + else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_FloatMin); + else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, ReductionInstDesc::MRK_FloatMax); + + return ReductionInstDesc(false, I); +} + +ReductionInstDesc ReductionDescriptor::isReductionInstr(Instruction *I, + ReductionKind Kind, + ReductionInstDesc &Prev, + bool HasFunNoNaNAttr) { + bool FP = I->getType()->isFloatingPointTy(); + bool FastMath = FP && I->hasUnsafeAlgebra(); + switch (I->getOpcode()) { + default: + return ReductionInstDesc(false, I); + case Instruction::PHI: + if (FP && + (Kind != RK_FloatMult && Kind != RK_FloatAdd && Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return ReductionInstDesc(I, Prev.getMinMaxKind()); + case Instruction::Sub: + case Instruction::Add: + return ReductionInstDesc(Kind == RK_IntegerAdd, I); + case Instruction::Mul: + return ReductionInstDesc(Kind == RK_IntegerMult, I); + case Instruction::And: + return ReductionInstDesc(Kind == RK_IntegerAnd, I); + case Instruction::Or: + return ReductionInstDesc(Kind == RK_IntegerOr, I); + case Instruction::Xor: + return ReductionInstDesc(Kind == RK_IntegerXor, I); + case Instruction::FMul: + return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); + case Instruction::FSub: + case Instruction::FAdd: + return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + if (Kind != RK_IntegerMinMax && + (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return isMinMaxSelectCmpPattern(I, Prev); + } +} + +bool ReductionDescriptor::hasMultipleUsesOf( + Instruction *I, SmallPtrSetImpl<Instruction *> &Insts) { + unsigned NumUses = 0; + for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; + ++Use) { + if (Insts.count(dyn_cast<Instruction>(*Use))) + ++NumUses; + if (NumUses > 1) + return true; + } + + return false; +} +bool ReductionDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, + ReductionDescriptor &RedDes) { + + bool HasFunNoNaNAttr = false; + BasicBlock *Header = TheLoop->getHeader(); + Function &F = *Header->getParent(); + if (F.hasFnAttribute("no-nans-fp-math")) + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; + + if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found a MUL reduction PHI." 
<< *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, + RedDes)) { + DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); + return true; + } + if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) { + DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi << "\n"); + return true; + } + // Not a reduction of known type. + return false; +} + +/// This function returns the identity element (or neutral element) for +/// the operation K. +Constant *ReductionDescriptor::getReductionIdentity(ReductionKind K, Type *Tp) { + switch (K) { + case RK_IntegerXor: + case RK_IntegerAdd: + case RK_IntegerOr: + // Adding, Xoring, Oring zero to a number does not change it. + return ConstantInt::get(Tp, 0); + case RK_IntegerMult: + // Multiplying a number by 1 does not change it. + return ConstantInt::get(Tp, 1); + case RK_IntegerAnd: + // AND-ing a number with an all-1 value does not change it. + return ConstantInt::get(Tp, -1, true); + case RK_FloatMult: + // Multiplying a number by 1 does not change it. + return ConstantFP::get(Tp, 1.0L); + case RK_FloatAdd: + // Adding zero to a number does not change it. + return ConstantFP::get(Tp, 0.0L); + default: + llvm_unreachable("Unknown reduction kind"); + } +} + +/// This function translates the reduction kind to an LLVM binary operator. 
+unsigned ReductionDescriptor::getReductionBinOp(ReductionKind Kind) { + switch (Kind) { + case RK_IntegerAdd: + return Instruction::Add; + case RK_IntegerMult: + return Instruction::Mul; + case RK_IntegerOr: + return Instruction::Or; + case RK_IntegerAnd: + return Instruction::And; + case RK_IntegerXor: + return Instruction::Xor; + case RK_FloatMult: + return Instruction::FMul; + case RK_FloatAdd: + return Instruction::FAdd; + case RK_IntegerMinMax: + return Instruction::ICmp; + case RK_FloatMinMax: + return Instruction::FCmp; + default: + llvm_unreachable("Unknown reduction operation"); + } +} + +Value * +ReductionDescriptor::createMinMaxOp(IRBuilder<> &Builder, + ReductionInstDesc::MinMaxReductionKind RK, + Value *Left, Value *Right) { + CmpInst::Predicate P = CmpInst::ICMP_NE; + switch (RK) { + default: + llvm_unreachable("Unknown min/max reduction kind"); + case ReductionInstDesc::MRK_UIntMin: + P = CmpInst::ICMP_ULT; + break; + case ReductionInstDesc::MRK_UIntMax: + P = CmpInst::ICMP_UGT; + break; + case ReductionInstDesc::MRK_SIntMin: + P = CmpInst::ICMP_SLT; + break; + case ReductionInstDesc::MRK_SIntMax: + P = CmpInst::ICMP_SGT; + break; + case ReductionInstDesc::MRK_FloatMin: + P = CmpInst::FCMP_OLT; + break; + case ReductionInstDesc::MRK_FloatMax: + P = CmpInst::FCMP_OGT; + break; + } + + Value *Cmp; + if (RK == ReductionInstDesc::MRK_FloatMin || + RK == ReductionInstDesc::MRK_FloatMax) + Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); + else + Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); + return Select; +} + +bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, + ConstantInt *&StepValue) { + Type *PhiTy = Phi->getType(); + // We only handle integer and pointer inductions variables. + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + return false; + + // Check that the PHI is consecutive. + const SCEV *PhiScev = SE->getSCEV(Phi); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); + if (!AR) { + DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); + return false; + } + + const SCEV *Step = AR->getStepRecurrence(*SE); + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) + return false; + + ConstantInt *CV = C->getValue(); + if (PhiTy->isIntegerTy()) { + StepValue = CV; + return true; + } + + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); + Type *PointerElementType = PhiTy->getPointerElementType(); + // The pointer stride cannot be determined if the pointer element type is not + // sized. 
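// Worked example of the stride computation below, as a standalone sketch
// (elementStride is a hypothetical helper): a pointer induction stepping 8
// bytes over 4-byte elements advances 8 / 4 = 2 elements per iteration,
// while a 6-byte step over 4-byte elements is rejected as non-consecutive.
#include <cstdint>
static bool elementStride(int64_t StepBytes, int64_t ElemSize,
                          int64_t &Stride) {
  if (ElemSize <= 0 || StepBytes % ElemSize)
    return false;              // step is not a whole number of elements
  Stride = StepBytes / ElemSize;
  return true;
}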
+ if (!PointerElementType->isSized()) + return false; + + const DataLayout &DL = Phi->getModule()->getDataLayout(); + int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType)); + int64_t CVSize = CV->getSExtValue(); + if (CVSize % Size) + return false; + StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + return true; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 04b9130..e0e0e90 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -14,17 +14,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/CFG.h" #include "llvm/Pass.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include <algorithm> using namespace llvm; @@ -32,6 +32,23 @@ using namespace llvm; #define DEBUG_TYPE "lower-switch" namespace { + struct IntRange { + int64_t Low, High; + }; + // Return true iff R is covered by Ranges. + static bool IsInRanges(const IntRange &R, + const std::vector<IntRange> &Ranges) { + // Note: Ranges must be sorted, non-overlapping and non-adjacent. + + // Find the first range whose High field is >= R.High, + // then check if the Low field is <= R.Low. If so, we + // have a Range that covers R. + auto I = std::lower_bound( + Ranges.begin(), Ranges.end(), R, + [](const IntRange &A, const IntRange &B) { return A.High < B.High; }); + return I != Ranges.end() && I->Low <= R.Low; + } + /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch /// instructions. class LowerSwitch : public FunctionPass { @@ -50,13 +67,12 @@ namespace { } struct CaseRange { - Constant* Low; - Constant* High; + ConstantInt* Low; + ConstantInt* High; BasicBlock* BB; - CaseRange(Constant *low = nullptr, Constant *high = nullptr, - BasicBlock *bb = nullptr) : - Low(low), High(high), BB(bb) { } + CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb) + : Low(low), High(high), BB(bb) {} }; typedef std::vector<CaseRange> CaseVector; @@ -67,7 +83,8 @@ namespace { BasicBlock *switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, Value *Val, BasicBlock *Predecessor, - BasicBlock *OrigBlock, BasicBlock *Default); + BasicBlock *OrigBlock, BasicBlock *Default, + const std::vector<IntRange> &UnreachableRanges); BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock, BasicBlock *Default); unsigned Clusterify(CaseVector &Cases, SwitchInst *SI); @@ -158,11 +175,16 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, // Remove additional occurences coming from condensed cases and keep the // number of incoming values equal to the number of branches to SuccBB. 
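// Self-contained usage sketch of the IsInRanges helper above (the local
// types mirror IntRange so the example compiles on its own; the ranges are
// hypothetical). The ranges must be sorted by High, non-overlapping and
// non-adjacent, exactly as UnreachableRanges is built further down.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>
struct Range { int64_t Low, High; };
static bool inRanges(const Range &R, const std::vector<Range> &Rs) {
  auto I = std::lower_bound(
      Rs.begin(), Rs.end(), R,
      [](const Range &A, const Range &B) { return A.High < B.High; });
  return I != Rs.end() && I->Low <= R.Low;
}
int main() {
  std::vector<Range> Rs = {{INT64_MIN, -1}, {100, INT64_MAX}};
  assert(inRanges({120, 130}, Rs));  // contained in {100, INT64_MAX}
  assert(!inRanges({-5, 5}, Rs));    // straddles the reachable gap [0, 99]
}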
+ SmallVector<unsigned, 8> Indices; for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) if (PN->getIncomingBlock(Idx) == OrigBB) { - PN->removeIncomingValue(Idx); + Indices.push_back(Idx); LocalNumMergedCases--; } + // Remove incoming values in the reverse order to prevent invalidating + // *successive* index. + for (auto III = Indices.rbegin(), IIE = Indices.rend(); III != IIE; ++III) + PN->removeIncomingValue(*III); } } @@ -171,12 +193,12 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, // LowerBound and UpperBound are used to keep track of the bounds for Val // that have already been checked by a block emitted by one of the previous // calls to switchConvert in the call stack. -BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, - ConstantInt *LowerBound, - ConstantInt *UpperBound, Value *Val, - BasicBlock *Predecessor, - BasicBlock *OrigBlock, - BasicBlock *Default) { +BasicBlock * +LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, + ConstantInt *UpperBound, Value *Val, + BasicBlock *Predecessor, BasicBlock *OrigBlock, + BasicBlock *Default, + const std::vector<IntRange> &UnreachableRanges) { unsigned Size = End - Begin; if (Size == 1) { @@ -203,32 +225,32 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, CaseRange &Pivot = *(Begin + Mid); DEBUG(dbgs() << "Pivot ==> " - << cast<ConstantInt>(Pivot.Low)->getValue() - << " -" << cast<ConstantInt>(Pivot.High)->getValue() << "\n"); + << Pivot.Low->getValue() + << " -" << Pivot.High->getValue() << "\n"); // NewLowerBound here should never be the integer minimal value. // This is because it is computed from a case range that is never // the smallest, so there is always a case range that has at least // a smaller value. - ConstantInt *NewLowerBound = cast<ConstantInt>(Pivot.Low); - ConstantInt *NewUpperBound; - - // If we don't have a Default block then it means that we can never - // have a value outside of a case range, so set the UpperBound to the highest - // value in the LHS part of the case ranges. - if (Default != nullptr) { - // Because NewLowerBound is never the smallest representable integer - // it is safe here to subtract one. - NewUpperBound = ConstantInt::get(NewLowerBound->getContext(), - NewLowerBound->getValue() - 1); - } else { - CaseItr LastLHS = LHS.begin() + LHS.size() - 1; - NewUpperBound = cast<ConstantInt>(LastLHS->High); + ConstantInt *NewLowerBound = Pivot.Low; + + // Because NewLowerBound is never the smallest representable integer + // it is safe here to subtract one. + ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(), + NewLowerBound->getValue() - 1); + + if (!UnreachableRanges.empty()) { + // Check if the gap between LHS's highest and NewLowerBound is unreachable. 
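// The reverse-order removal used in fixPhis above, distilled into a
// standalone sketch: erasing ascending indices front-to-back would shift
// every later index, so the recorded indices are walked backwards instead
// (eraseIndices is a hypothetical helper).
#include <vector>
static void eraseIndices(std::vector<int> &V,
                         const std::vector<unsigned> &Idx) {
  // Idx is in ascending order, as collected during the scan.
  for (auto I = Idx.rbegin(), E = Idx.rend(); I != E; ++I)
    V.erase(V.begin() + *I);   // safe: only smaller indices remain valid
}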
+ int64_t GapLow = LHS.back().High->getSExtValue() + 1; + int64_t GapHigh = NewLowerBound->getSExtValue() - 1; + IntRange Gap = { GapLow, GapHigh }; + if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges)) + NewUpperBound = LHS.back().High; } DEBUG(dbgs() << "LHS Bounds ==> "; if (LowerBound) { - dbgs() << cast<ConstantInt>(LowerBound)->getSExtValue(); + dbgs() << LowerBound->getSExtValue(); } else { dbgs() << "NONE"; } @@ -236,7 +258,7 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, dbgs() << "RHS Bounds ==> "; dbgs() << NewLowerBound->getSExtValue() << " - "; if (UpperBound) { - dbgs() << cast<ConstantInt>(UpperBound)->getSExtValue() << "\n"; + dbgs() << UpperBound->getSExtValue() << "\n"; } else { dbgs() << "NONE\n"; }); @@ -251,10 +273,10 @@ BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound, NewUpperBound, Val, NewNode, OrigBlock, - Default); + Default, UnreachableRanges); BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound, UpperBound, Val, NewNode, OrigBlock, - Default); + Default, UnreachableRanges); Function::iterator FI = OrigBlock; F->getBasicBlockList().insert(++FI, NewNode); @@ -287,11 +309,11 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, Leaf.Low, "SwitchLeaf"); } else { // Make range comparison - if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) { + if (Leaf.Low->isMinValue(true /*isSigned*/)) { // Val >= Min && Val <= Hi --> Val <= Hi Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High, "SwitchLeaf"); - } else if (cast<ConstantInt>(Leaf.Low)->isZero()) { + } else if (Leaf.Low->isZero()) { // Val >= 0 && Val <= Hi --> Val <=u Hi Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High, "SwitchLeaf"); @@ -316,8 +338,8 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) { PHINode* PN = cast<PHINode>(I); // Remove all but one incoming entries from the cluster - uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() - - cast<ConstantInt>(Leaf.Low)->getSExtValue(); + uint64_t Range = Leaf.High->getSExtValue() - + Leaf.Low->getSExtValue(); for (uint64_t j = 0; j < Range; ++j) { PN->removeIncomingValue(OrigBlock); } @@ -345,8 +367,8 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { if (Cases.size()>=2) for (CaseItr I = Cases.begin(), J = std::next(Cases.begin()); J != Cases.end();) { - int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue(); - int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue(); + int64_t nextValue = J->Low->getSExtValue(); + int64_t currentValue = I->High->getSExtValue(); BasicBlock* nextBB = J->BB; BasicBlock* currentBB = I->BB; @@ -379,26 +401,102 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { Value *Val = SI->getCondition(); // The value we are switching on... BasicBlock* Default = SI->getDefaultDest(); - // If there is only the default destination, don't bother with the code below. + // If there is only the default destination, just branch. if (!SI->getNumCases()) { - BranchInst::Create(SI->getDefaultDest(), CurBlock); - CurBlock->getInstList().erase(SI); + BranchInst::Create(Default, CurBlock); + SI->eraseFromParent(); return; } - const bool DefaultIsUnreachable = - Default->size() == 1 && isa<UnreachableInst>(Default->getTerminator()); + // Prepare cases vector. 
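// A source-level sketch of the branch tree switchConvert emits (hypothetical
// cases 0..3 and 10..13; the pivot splits the sorted case ranges in half,
// and the bounds reasoning above lets leaves drop redundant compares):
static int loweredSwitch(int V) {
  if (V < 10) {                        // pivot: lowest case of the right half
    if (V >= 0 && V <= 3) return 1;    // leaf for cases 0..3
  } else {
    if (V <= 13) return 2;             // leaf for 10..13; V >= 10 is known
  }
  return 0;                            // default
}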
+ CaseVector Cases; + unsigned numCmps = Clusterify(Cases, SI); + DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() + << ". Total compares: " << numCmps << "\n"); + DEBUG(dbgs() << "Cases: " << Cases << "\n"); + (void)numCmps; + + ConstantInt *LowerBound = nullptr; + ConstantInt *UpperBound = nullptr; + std::vector<IntRange> UnreachableRanges; + + if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) { + // Make the bounds tightly fitted around the case value range, because we + // know that the value passed to the switch must be exactly one of the case + // values. + assert(!Cases.empty()); + LowerBound = Cases.front().Low; + UpperBound = Cases.back().High; + + DenseMap<BasicBlock *, unsigned> Popularity; + unsigned MaxPop = 0; + BasicBlock *PopSucc = nullptr; + + IntRange R = { INT64_MIN, INT64_MAX }; + UnreachableRanges.push_back(R); + for (const auto &I : Cases) { + int64_t Low = I.Low->getSExtValue(); + int64_t High = I.High->getSExtValue(); + + IntRange &LastRange = UnreachableRanges.back(); + if (LastRange.Low == Low) { + // There is nothing left of the previous range. + UnreachableRanges.pop_back(); + } else { + // Terminate the previous range. + assert(Low > LastRange.Low); + LastRange.High = Low - 1; + } + if (High != INT64_MAX) { + IntRange R = { High + 1, INT64_MAX }; + UnreachableRanges.push_back(R); + } + + // Count popularity. + int64_t N = High - Low + 1; + unsigned &Pop = Popularity[I.BB]; + if ((Pop += N) > MaxPop) { + MaxPop = Pop; + PopSucc = I.BB; + } + } +#ifndef NDEBUG + /* UnreachableRanges should be sorted and the ranges non-adjacent. */ + for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end(); + I != E; ++I) { + assert(I->Low <= I->High); + auto Next = I + 1; + if (Next != E) { + assert(Next->Low > I->High); + } + } +#endif + + // Use the most popular block as the new default, reducing the number of + // cases. + assert(MaxPop > 0 && PopSucc); + Default = PopSucc; + for (CaseItr I = Cases.begin(); I != Cases.end();) { + if (I->BB == PopSucc) + I = Cases.erase(I); + else + ++I; + } + + // If there are no cases left, just branch. + if (Cases.empty()) { + BranchInst::Create(Default, CurBlock); + SI->eraseFromParent(); + return; + } + } + // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. - // if the default block is set as an unreachable we avoid creating one - // because will never be a valid target. - BasicBlock *NewDefault = nullptr; - if (!DefaultIsUnreachable) { - NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); - - BranchInst::Create(Default, NewDefault); - } + BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); + F->getBasicBlockList().insert(Default, NewDefault); + BranchInst::Create(Default, NewDefault); + // If there is an entry in any PHI nodes for the default edge, make sure + // to update them as well. for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) { @@ -408,40 +506,18 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { PN->setIncomingBlock((unsigned)BlockIdx, NewDefault); } - // Prepare cases vector. - CaseVector Cases; - unsigned numCmps = Clusterify(Cases, SI); - - DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() - << ". 
Total compares: " << numCmps << "\n"); - DEBUG(dbgs() << "Cases: " << Cases << "\n"); - (void)numCmps; - - ConstantInt *UpperBound = nullptr; - ConstantInt *LowerBound = nullptr; - - // Optimize the condition where Default is an unreachable block. In this case - // we can make the bounds tightly fitted around the case value ranges, - // because we know that the value passed to the switch should always be - // exactly one of the case values. - if (DefaultIsUnreachable) { - CaseItr LastCase = Cases.begin() + Cases.size() - 1; - UpperBound = cast<ConstantInt>(LastCase->High); - LowerBound = cast<ConstantInt>(Cases.begin()->Low); - } BasicBlock *SwitchBlock = switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, - OrigBlock, OrigBlock, NewDefault); + OrigBlock, OrigBlock, NewDefault, UnreachableRanges); // Branch to our shiny new if-then stuff... BranchInst::Create(SwitchBlock, OrigBlock); // We are now done with the switch instruction, delete it. + BasicBlock *OldDefault = SI->getDefaultDest(); CurBlock->getInstList().erase(SI); - pred_iterator PI = pred_begin(Default), E = pred_end(Default); - // If the Default block has no more predecessors just remove it - if (PI == E) { - DeleteDeadBlock(Default); - } + // If the Default block has no more predecessors just remove it. + if (pred_begin(OldDefault) == pred_end(OldDefault)) + DeleteDeadBlock(OldDefault); } diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index 35c701e..d69a81e 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -93,3 +94,34 @@ llvm::collectUsedGlobalVariables(Module &M, SmallPtrSetImpl<GlobalValue *> &Set, } return GV; } + +Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) { + if (isa<Function>(FuncOrBitcast)) + return cast<Function>(FuncOrBitcast); + FuncOrBitcast->dump(); + std::string Err; + raw_string_ostream Stream(Err); + Stream << "Sanitizer interface function redefined: " << *FuncOrBitcast; + report_fatal_error(Err); +} + +std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( + Module &M, StringRef CtorName, StringRef InitName, + ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs) { + assert(!InitName.empty() && "Expected init function name"); + assert(InitArgTypes.size() == InitArgTypes.size() && + "Sanitizer's init function expects different number of arguments"); + Function *Ctor = Function::Create( + FunctionType::get(Type::getVoidTy(M.getContext()), false), + GlobalValue::InternalLinkage, CtorName, &M); + BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor); + IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB)); + Function *InitFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + InitName, FunctionType::get(IRB.getVoidTy(), InitArgTypes, false), + AttributeSet())); + InitFunction->setLinkage(Function::ExternalLinkage); + IRB.CreateCall(InitFunction, InitArgs); + return std::make_pair(Ctor, InitFunction); +} + diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index dabadb7..623dbc9 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp 
@@ -13,16 +13,6 @@ // traversing the function in depth-first order to rewrite loads and stores as // appropriate. // -// The algorithm used here is based on: -// -// Sreedhar and Gao. A linear time algorithm for placing phi-nodes. -// In Proceedings of the 22nd ACM SIGPLAN-SIGACT Symposium on Principles of -// Programming Languages -// POPL '95. ACM, New York, NY, 62-73. -// -// It has been modified to not explicitly use the DJ graph data structure and to -// directly compute pruned SSA using per-variable liveness information. -// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/PromoteMemToReg.h" @@ -34,6 +24,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -45,9 +36,9 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> -#include <queue> using namespace llvm; #define DEBUG_TYPE "mem2reg" @@ -274,9 +265,6 @@ struct PromoteMem2Reg { /// behavior. DenseMap<BasicBlock *, unsigned> BBNumbers; - /// Maps DomTreeNodes to their level in the dominator tree. - DenseMap<DomTreeNode *, unsigned> DomLevels; - /// Lazily compute the number of predecessors a block has. DenseMap<const BasicBlock *, unsigned> BBNumPreds; @@ -303,8 +291,6 @@ private: return NP - 1; } - void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, - AllocaInfo &Info); void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, const SmallPtrSetImpl<BasicBlock *> &DefBlocks, SmallPtrSetImpl<BasicBlock *> &LiveInBlocks); @@ -531,6 +517,7 @@ void PromoteMem2Reg::run() { AllocaInfo Info; LargeBlockInfo LBI; + IDFCalculator IDF(DT); for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { AllocaInst *AI = Allocas[AllocaNum]; @@ -578,31 +565,12 @@ void PromoteMem2Reg::run() { continue; } - // If we haven't computed dominator tree levels, do so now. - if (DomLevels.empty()) { - SmallVector<DomTreeNode *, 32> Worklist; - - DomTreeNode *Root = DT.getRootNode(); - DomLevels[Root] = 0; - Worklist.push_back(Root); - - while (!Worklist.empty()) { - DomTreeNode *Node = Worklist.pop_back_val(); - unsigned ChildLevel = DomLevels[Node] + 1; - for (DomTreeNode::iterator CI = Node->begin(), CE = Node->end(); - CI != CE; ++CI) { - DomLevels[*CI] = ChildLevel; - Worklist.push_back(*CI); - } - } - } - // If we haven't computed a numbering for the BB's in the function, do so // now. if (BBNumbers.empty()) { unsigned ID = 0; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) - BBNumbers[I] = ID++; + for (auto &BB : F) + BBNumbers[&BB] = ID++; } // If we have an AST to keep updated, remember some pointer value that is @@ -621,7 +589,34 @@ void PromoteMem2Reg::run() { // the standard SSA construction algorithm. Determine which blocks need PHI // nodes and see if we can optimize out some work by avoiding insertion of // dead phi nodes. - DetermineInsertionPoint(AI, AllocaNum, Info); + + + // Unique the set of defining blocks for efficient lookup. + SmallPtrSet<BasicBlock *, 32> DefBlocks; + DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); + + // Determine which blocks the value is live in. These are blocks which lead + // to uses. 
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks; + ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); + + // At this point, we're committed to promoting the alloca using IDF's, and + // the standard SSA construction algorithm. Determine which blocks need phi + // nodes and see if we can optimize out some work by avoiding insertion of + // dead phi nodes. + IDF.setLiveInBlocks(LiveInBlocks); + IDF.setDefiningBlocks(DefBlocks); + SmallVector<BasicBlock *, 32> PHIBlocks; + IDF.calculate(PHIBlocks); + if (PHIBlocks.size() > 1) + std::sort(PHIBlocks.begin(), PHIBlocks.end(), + [this](BasicBlock *A, BasicBlock *B) { + return BBNumbers.lookup(A) < BBNumbers.lookup(B); + }); + + unsigned CurrentVersion = 0; + for (unsigned i = 0, e = PHIBlocks.size(); i != e; ++i) + QueuePhiNode(PHIBlocks[i], AllocaNum, CurrentVersion); } if (Allocas.empty()) @@ -667,6 +662,8 @@ void PromoteMem2Reg::run() { A->eraseFromParent(); } + const DataLayout &DL = F.getParent()->getDataLayout(); + // Remove alloca's dbg.declare instrinsics from the function. for (unsigned i = 0, e = AllocaDbgDeclares.size(); i != e; ++i) if (DbgDeclareInst *DDI = AllocaDbgDeclares[i]) @@ -691,7 +688,7 @@ void PromoteMem2Reg::run() { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. - if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AC)) { + if (Value *V = SimplifyInstruction(PN, DL, nullptr, &DT, AC)) { if (AST && PN->getType()->isPointerTy()) AST->deleteValue(PN); PN->replaceAllUsesWith(V); @@ -841,95 +838,6 @@ void PromoteMem2Reg::ComputeLiveInBlocks( } } -/// At this point, we're committed to promoting the alloca using IDF's, and the -/// standard SSA construction algorithm. Determine which blocks need phi nodes -/// and see if we can optimize out some work by avoiding insertion of dead phi -/// nodes. -void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, - AllocaInfo &Info) { - // Unique the set of defining blocks for efficient lookup. - SmallPtrSet<BasicBlock *, 32> DefBlocks; - DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); - - // Determine which blocks the value is live in. These are blocks which lead - // to uses. - SmallPtrSet<BasicBlock *, 32> LiveInBlocks; - ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); - - // Use a priority queue keyed on dominator tree level so that inserted nodes - // are handled from the bottom of the dominator tree upwards. - typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair; - typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>, - less_second> IDFPriorityQueue; - IDFPriorityQueue PQ; - - for (BasicBlock *BB : DefBlocks) { - if (DomTreeNode *Node = DT.getNode(BB)) - PQ.push(std::make_pair(Node, DomLevels[Node])); - } - - SmallVector<std::pair<unsigned, BasicBlock *>, 32> DFBlocks; - SmallPtrSet<DomTreeNode *, 32> Visited; - SmallVector<DomTreeNode *, 32> Worklist; - while (!PQ.empty()) { - DomTreeNodePair RootPair = PQ.top(); - PQ.pop(); - DomTreeNode *Root = RootPair.first; - unsigned RootLevel = RootPair.second; - - // Walk all dominator tree children of Root, inspecting their CFG edges with - // targets elsewhere on the dominator tree. Only targets whose level is at - // most Root's level are added to the iterated dominance frontier of the - // definition set. 
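// The new IDFCalculator flow above, condensed into a hedged sketch
// (phiBlocksFor is a local, hypothetical name; the API calls are the ones
// used in run()): given the blocks that define a value and the blocks where
// it is live, the calculator returns exactly the blocks that need PHI nodes.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;
static void phiBlocksFor(DominatorTree &DT,
                         SmallPtrSetImpl<BasicBlock *> &Defs,
                         SmallPtrSetImpl<BasicBlock *> &LiveIn,
                         SmallVectorImpl<BasicBlock *> &Out) {
  IDFCalculator IDF(DT);
  IDF.setDefiningBlocks(Defs);
  IDF.setLiveInBlocks(LiveIn);
  IDF.calculate(Out);   // iterated dominance frontier, pruned by liveness
}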
- - Worklist.clear(); - Worklist.push_back(Root); - - while (!Worklist.empty()) { - DomTreeNode *Node = Worklist.pop_back_val(); - BasicBlock *BB = Node->getBlock(); - - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; - ++SI) { - DomTreeNode *SuccNode = DT.getNode(*SI); - - // Quickly skip all CFG edges that are also dominator tree edges instead - // of catching them below. - if (SuccNode->getIDom() == Node) - continue; - - unsigned SuccLevel = DomLevels[SuccNode]; - if (SuccLevel > RootLevel) - continue; - - if (!Visited.insert(SuccNode).second) - continue; - - BasicBlock *SuccBB = SuccNode->getBlock(); - if (!LiveInBlocks.count(SuccBB)) - continue; - - DFBlocks.push_back(std::make_pair(BBNumbers[SuccBB], SuccBB)); - if (!DefBlocks.count(SuccBB)) - PQ.push(std::make_pair(SuccNode, SuccLevel)); - } - - for (DomTreeNode::iterator CI = Node->begin(), CE = Node->end(); CI != CE; - ++CI) { - if (!Visited.count(*CI)) - Worklist.push_back(*CI); - } - } - } - - if (DFBlocks.size() > 1) - std::sort(DFBlocks.begin(), DFBlocks.end()); - - unsigned CurrentVersion = 0; - for (unsigned i = 0, e = DFBlocks.size(); i != e; ++i) - QueuePhiNode(DFBlocks[i].second, AllocaNum, CurrentVersion); -} - /// \brief Queue a phi-node to be added to a basic-block for a specific Alloca. /// /// Returns true if there wasn't already a phi-node for that variable diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 3fcb789..88b39dd 100644 --- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -150,12 +151,13 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { ProtoName, &BB->front()); // Fill in all the predecessors of the PHI. - for (unsigned i = 0, e = PredValues.size(); i != e; ++i) - InsertedPHI->addIncoming(PredValues[i].second, PredValues[i].first); + for (const auto &PredValue : PredValues) + InsertedPHI->addIncoming(PredValue.second, PredValue.first); // See if the PHI node can be merged to a single value. This can happen in // loop cases when we get a PHI of itself and one other value. - if (Value *V = SimplifyInstruction(InsertedPHI)) { + if (Value *V = + SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) { InsertedPHI->eraseFromParent(); return V; } @@ -245,8 +247,7 @@ public: // but it is relatively slow. If we already have PHI nodes in this // block, walk one of them to get the predecessor list instead. 
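// A hedged sketch of the SSAUpdater entry points exercised in this file
// (rewriteCrossBlockUse is a hypothetical wrapper):
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
static void rewriteCrossBlockUse(Type *Ty, BasicBlock *DefBB, Value *Def,
                                 Use &U) {
  SSAUpdater SSA;
  SSA.Initialize(Ty, "promoted");    // type and base name for inserted PHIs
  SSA.AddAvailableValue(DefBB, Def); // record a definition per defining block
  SSA.RewriteUse(U);                 // patches U, creating PHIs as needed
}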
if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) { - for (unsigned PI = 0, E = SomePhi->getNumIncomingValues(); PI != E; ++PI) - Preds->push_back(SomePhi->getIncomingBlock(PI)); + Preds->append(SomePhi->block_begin(), SomePhi->block_end()); } else { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) Preds->push_back(*PI); @@ -321,12 +322,12 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { //===----------------------------------------------------------------------===// LoadAndStorePromoter:: -LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts, +LoadAndStorePromoter(ArrayRef<const Instruction*> Insts, SSAUpdater &S, StringRef BaseName) : SSA(S) { if (Insts.empty()) return; - Value *SomeVal; - if (LoadInst *LI = dyn_cast<LoadInst>(Insts[0])) + const Value *SomeVal; + if (const LoadInst *LI = dyn_cast<LoadInst>(Insts[0])) SomeVal = LI; else SomeVal = cast<StoreInst>(Insts[0])->getOperand(0); @@ -344,20 +345,17 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // This is important because we have to handle multiple defs/uses in a block // ourselves: SSAUpdater is purely for cross-block references. DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock; - - for (unsigned i = 0, e = Insts.size(); i != e; ++i) { - Instruction *User = Insts[i]; + + for (Instruction *User : Insts) UsesByBlock[User->getParent()].push_back(User); - } // Okay, now we can iterate over all the blocks in the function with uses, // processing them. Keep track of which loads are loading a live-in value. // Walk the uses in the use-list order to be determinstic. SmallVector<LoadInst*, 32> LiveInLoads; DenseMap<Value*, Value*> ReplacedLoads; - - for (unsigned i = 0, e = Insts.size(); i != e; ++i) { - Instruction *User = Insts[i]; + + for (Instruction *User : Insts) { BasicBlock *BB = User->getParent(); TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB]; @@ -380,8 +378,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // Otherwise, check to see if this block is all loads. bool HasStore = false; - for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) { - if (isa<StoreInst>(BlockUses[i])) { + for (Instruction *I : BlockUses) { + if (isa<StoreInst>(I)) { HasStore = true; break; } @@ -391,8 +389,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // efficient way to tell which on is first in the block and don't want to // scan large blocks, so just add all loads as live ins. if (!HasStore) { - for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) - LiveInLoads.push_back(cast<LoadInst>(BlockUses[i])); + for (Instruction *I : BlockUses) + LiveInLoads.push_back(cast<LoadInst>(I)); BlockUses.clear(); continue; } @@ -403,8 +401,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // block is a load, then it uses the live in value. The last store defines // the live out value. We handle this by doing a linear scan of the block. Value *StoredValue = nullptr; - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { - if (LoadInst *L = dyn_cast<LoadInst>(II)) { + for (Instruction &I : *BB) { + if (LoadInst *L = dyn_cast<LoadInst>(&I)) { // If this is a load from an unrelated pointer, ignore it. if (!isInstInList(L, Insts)) continue; @@ -419,8 +417,8 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { } continue; } - - if (StoreInst *SI = dyn_cast<StoreInst>(II)) { + + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { // If this is a store to an unrelated pointer, ignore it. 
if (!isInstInList(SI, Insts)) continue; updateDebugInfo(SI); @@ -438,8 +436,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // Okay, now we rewrite all loads that use live-in values in the loop, // inserting PHI nodes as necessary. - for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) { - LoadInst *ALoad = LiveInLoads[i]; + for (LoadInst *ALoad : LiveInLoads) { Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); replaceLoadWithValue(ALoad, NewVal); @@ -454,9 +451,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. - for (unsigned i = 0, e = Insts.size(); i != e; ++i) { - Instruction *User = Insts[i]; - + for (Instruction *User : Insts) { // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index f6867c2..60ac271 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -53,9 +53,13 @@ using namespace PatternMatch; #define DEBUG_TYPE "simplifycfg" +// Chosen as 2 so as to be cheap, but still to have enough power to fold +// a select, so the "clamp" idiom (of a min followed by a max) will be caught. +// To catch this, we need to fold a compare and a select, hence '2' being the +// minimum reasonable default. static cl::opt<unsigned> -PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1), - cl::desc("Control the amount of phi node folding to perform (default = 1)")); +PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(2), + cl::desc("Control the amount of phi node folding to perform (default = 2)")); static cl::opt<bool> DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), @@ -106,8 +110,8 @@ namespace { class SimplifyCFGOpt { const TargetTransformInfo &TTI; + const DataLayout &DL; unsigned BonusInstThreshold; - const DataLayout *const DL; AssumptionCache *AC; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, @@ -127,9 +131,9 @@ class SimplifyCFGOpt { bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder); public: - SimplifyCFGOpt(const TargetTransformInfo &TTI, unsigned BonusInstThreshold, - const DataLayout *DL, AssumptionCache *AC) - : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AC(AC) {} + SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL, + unsigned BonusInstThreshold, AssumptionCache *AC) + : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC) {} bool run(BasicBlock *BB); }; } @@ -216,45 +220,15 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, } /// ComputeSpeculationCost - Compute an abstract "cost" of speculating the -/// given instruction, which is assumed to be safe to speculate. 1 means -/// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. -static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) { - assert(isSafeToSpeculativelyExecute(I, DL) && +/// given instruction, which is assumed to be safe to speculate. TCC_Free means +/// cheap, TCC_Basic means less cheap, and TCC_Expensive means prohibitively +/// expensive. 
+static unsigned ComputeSpeculationCost(const User *I, + const TargetTransformInfo &TTI) { + assert(isSafeToSpeculativelyExecute(I) && "Instruction is not safe to speculatively execute!"); - switch (Operator::getOpcode(I)) { - default: - // In doubt, be conservative. - return UINT_MAX; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return UINT_MAX; - return 1; - case Instruction::ExtractValue: - case Instruction::Load: - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::ICmp: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::BitCast: - case Instruction::ExtractElement: - case Instruction::InsertElement: - return 1; // These are all cheap. - - case Instruction::Call: - case Instruction::Select: - return 2; - } + return TTI.getUserCost(I); } - /// DominatesMergePoint - If we have a merge point of an "if condition" as /// accepted above, return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -275,7 +249,7 @@ static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) { static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction*> *AggressiveInsts, unsigned &CostRemaining, - const DataLayout *DL) { + const TargetTransformInfo &TTI) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -308,10 +282,10 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // Okay, it looks like the instruction IS in the "condition". Check to // see if it's a cheap instruction to unconditionally compute, and if it // only uses stuff defined outside of the condition. If so, hoist it out. - if (!isSafeToSpeculativelyExecute(I, DL)) + if (!isSafeToSpeculativelyExecute(I)) return false; - unsigned Cost = ComputeSpeculationCost(I, DL); + unsigned Cost = ComputeSpeculationCost(I, TTI); if (Cost > CostRemaining) return false; @@ -321,7 +295,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts->insert(I); @@ -330,15 +304,15 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, /// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr /// and PointerNullValue. Return NULL if value is not a constant int. -static ConstantInt *GetConstantInt(Value *V, const DataLayout *DL) { +static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) { // Normal constant int. ConstantInt *CI = dyn_cast<ConstantInt>(V); - if (CI || !DL || !isa<Constant>(V) || !V->getType()->isPointerTy()) + if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy()) return CI; // This is some kind of pointer constant. Turn it into a pointer-sized // ConstantInt if possible. 
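// The "clamp" idiom that motivated raising PHINodeFoldingThreshold to 2 in
// this patch, as a source-level sketch: folding each arm requires
// speculating a compare plus a select, so a budget of 1 was too small.
static int clamp(int X, int Lo, int Hi) {
  int T = X < Lo ? Lo : X;   // max(X, Lo): one compare + one select
  return T > Hi ? Hi : T;    // min(T, Hi): one compare + one select
}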
-  IntegerType *PtrTy = cast<IntegerType>(DL->getIntPtrType(V->getType()));
+  IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
 
   // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
   if (isa<ConstantPointerNull>(V))
@@ -371,23 +345,22 @@ namespace {
 /// while for a chain of '&&' it will build the set elements that make the test
 /// fail.
 struct ConstantComparesGatherer {
-
+  const DataLayout &DL;
   Value *CompValue; /// Value found for the switch comparison
   Value *Extra;     /// Extra clause to be checked before the switch
   SmallVector<ConstantInt *, 8> Vals; /// Set of integers to match in switch
   unsigned UsedICmps; /// Number of comparisons matched in the and/or chain
 
   /// Construct and compute the result for the comparison instruction Cond
-  ConstantComparesGatherer(Instruction *Cond, const DataLayout *DL)
-      : CompValue(nullptr), Extra(nullptr), UsedICmps(0) {
-    gather(Cond, DL);
+  ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL)
+      : DL(DL), CompValue(nullptr), Extra(nullptr), UsedICmps(0) {
+    gather(Cond);
   }
 
   /// Prevent copy
-  ConstantComparesGatherer(const ConstantComparesGatherer &)
-      LLVM_DELETED_FUNCTION;
+  ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
   ConstantComparesGatherer &
-  operator=(const ConstantComparesGatherer &) LLVM_DELETED_FUNCTION;
+  operator=(const ConstantComparesGatherer &) = delete;
 
 private:
@@ -406,7 +379,7 @@ private:
   /// against is placed in CompValue.
   /// If CompValue is already set, the function is expected to fail if a match
   /// is found but the value compared to is different.
-  bool matchInstruction(Instruction *I, const DataLayout *DL, bool isEQ) {
+  bool matchInstruction(Instruction *I, bool isEQ) {
     // If this is an icmp against a constant, handle this as one of the cases.
     ICmpInst *ICI;
     ConstantInt *C;
@@ -448,8 +421,8 @@ private:
     }
 
     // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
-    ConstantRange Span = ConstantRange::makeICmpRegion(ICI->getPredicate(),
-                                                       C->getValue());
+    ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
+        ICI->getPredicate(), C->getValue());
 
     // Shift the range if the compare is fed by an add. This is the range
     // compare idiom as emitted by instcombine.
@@ -488,7 +461,7 @@ private:
   /// the value being compared, and stick the list constants into the Vals
   /// vector.
   /// One "Extra" case is allowed to differ from the other.
-  void gather(Value *V, const DataLayout *DL) {
+  void gather(Value *V) {
     Instruction *I = dyn_cast<Instruction>(V);
     bool isEQ = (I->getOpcode() == Instruction::Or);
@@ -510,7 +483,7 @@ private:
       }
 
       // Try to match the current instruction
-      if (matchInstruction(I, DL, isEQ))
+      if (matchInstruction(I, isEQ))
        // Match succeeded, continue the loop
         continue;
     }
@@ -558,15 +531,16 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
     CV = SI->getCondition();
   } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
     if (BI->isConditional() && BI->getCondition()->hasOneUse())
-      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
         if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
           CV = ICI->getOperand(0);
+      }
 
   // Unwrap any lossless ptrtoint cast.
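For context, the kind of condition ConstantComparesGatherer decomposes can be written at source level like this (hypothetical example); per the "x ult 3" comment above, the range compare contributes {0, 1, 2} to the matched set and the equality tests contribute their constants, so the whole chain can become one switch:

    // Hypothetical or-chain that the gatherer turns into a switch over
    // {0, 1, 2, 8, 13}: the unsigned compare yields the range [0, 3).
    bool isInteresting(unsigned U) {
      return U < 3 || U == 8 || U == 13;
    }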
- if (DL && CV) { + if (CV) { if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) { Value *Ptr = PTII->getPointerOperand(); - if (PTII->getType() == DL->getIntPtrType(Ptr->getType())) + if (PTII->getType() == DL.getIntPtrType(Ptr->getType())) CV = Ptr; } } @@ -1007,8 +981,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. if (CV->getType()->isPointerTy()) { - assert(DL && "Cannot switch on pointer without DataLayout"); - CV = Builder.CreatePtrToInt(CV, DL->getIntPtrType(CV->getType()), + CV = Builder.CreatePtrToInt(CV, DL.getIntPtrType(CV->getType()), "magicptr"); } @@ -1079,7 +1052,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I); /// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and /// BB2, hoist any common code in the two blocks up into the branch block. The /// caller of this function guarantees that BI's block dominates BB1 and BB2. -static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) { +static bool HoistThenElseCodeToIf(BranchInst *BI, + const TargetTransformInfo &TTI) { // This does very trivial matching, with limited scanning, to find identical // instructions in the two blocks. In particular, we don't want to get into // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As @@ -1114,6 +1088,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) { if (isa<TerminatorInst>(I1)) goto HoistTerminator; + if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2)) + return Changed; + // For a normal instruction, we just move one to right before the branch, // then replace all uses of the other with the first. Finally, we remove // the now redundant second instruction. @@ -1167,9 +1144,9 @@ HoistTerminator: passingValueIsAlwaysUndefined(BB2V, PN)) return Changed; - if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V, DL)) + if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) return Changed; - if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V, DL)) + if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) return Changed; } } @@ -1489,7 +1466,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, /// /// \returns true if the conditional block is removed. static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, - const DataLayout *DL) { + const TargetTransformInfo &TTI) { // Be conservative for now. FP select instruction can often be expensive. Value *BrCond = BI->getCondition(); if (isa<FCmpInst>(BrCond)) @@ -1525,20 +1502,20 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, if (isa<DbgInfoIntrinsic>(I)) continue; - // Only speculatively execution a single instruction (not counting the + // Only speculatively execute a single instruction (not counting the // terminator) for now. ++SpeculationCost; if (SpeculationCost > 1) return false; // Don't hoist the instruction if it's unsafe or expensive. 
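A minimal source-level sketch of what HoistThenElseCodeToIf (above) does, using hypothetical functions: identical leading instructions in the two successors are computed once in the branch block, now only when the target's isProfitableToHoist hook agrees:

    // Before: both arms begin with the same multiply.
    int hoistBefore(int A, int B, bool C) {
      if (C) { int T = A * B; return T + 1; }
      int T = A * B;
      return T - 1;
    }
    // After: the common multiply is hoisted above the branch.
    int hoistAfter(int A, int B, bool C) {
      int T = A * B;
      return C ? T + 1 : T - 1;
    }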
- if (!isSafeToSpeculativelyExecute(I, DL) && - !(HoistCondStores && - (SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB, - EndBB)))) + if (!isSafeToSpeculativelyExecute(I) && + !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore( + I, BB, ThenBB, EndBB)))) return false; if (!SpeculatedStoreValue && - ComputeSpeculationCost(I, DL) > PHINodeFoldingThreshold) + ComputeSpeculationCost(I, TTI) > + PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) return false; // Store the store speculation candidate. @@ -1594,12 +1571,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, if (!OrigCE && !ThenCE) continue; // Known safe and cheap. - if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) || - (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL))) + if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) || + (OrigCE && !isSafeToSpeculativelyExecute(OrigCE))) return false; - unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL) : 0; - unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL) : 0; - if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold) + unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, TTI) : 0; + unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, TTI) : 0; + unsigned MaxCost = 2 * PHINodeFoldingThreshold * + TargetTransformInfo::TCC_Basic; + if (OrigCost + ThenCost > MaxCost) return false; // Account for the cost of an unfolded ConstantExpr which could end up @@ -1706,7 +1685,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { /// that is defined in the same block as the branch and if any PHI entries are /// constants, thread edges corresponding to that entry to be branches to their /// ultimate destination. -static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) { +static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { BasicBlock *BB = BI->getParent(); PHINode *PN = dyn_cast<PHINode>(BI->getCondition()); // NOTE: we currently cannot transform this case if the PHI node is used @@ -1804,7 +1783,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) { /// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry /// PHI node, see if we can eliminate it. -static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { +static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, + const DataLayout &DL) { // Ok, this is a two entry PHI node. Check to see if this is a simple "if // statement", which has a very simple dominance structure. 
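The speculation path being retuned here can be pictured with a hypothetical source-level pair: a single safe instruction in the conditional block is executed unconditionally and the branch collapses into a select, as long as its TTI cost stays within PHINodeFoldingThreshold times TCC_Basic:

    // Before: a one-instruction conditional block.
    int specBefore(int X, bool C) {
      int R = X;
      if (C)
        R = X + 4; // cheap and safe to speculate
      return R;
    }
    // After: the add runs unconditionally; a select picks the result.
    int specAfter(int X, bool C) {
      int T = X + 4;
      return C ? T : X;
    }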
Basically, we // are trying to find the condition that is being branched on, which @@ -1835,6 +1815,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { SmallPtrSet<Instruction*, 4> AggressiveInsts; unsigned MaxCostVal0 = PHINodeFoldingThreshold, MaxCostVal1 = PHINodeFoldingThreshold; + MaxCostVal0 *= TargetTransformInfo::TCC_Basic; + MaxCostVal1 *= TargetTransformInfo::TCC_Basic; for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); @@ -1845,9 +1827,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts, - MaxCostVal0, DL) || + MaxCostVal0, TTI) || !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts, - MaxCostVal1, DL)) + MaxCostVal1, TTI)) return false; } @@ -2067,8 +2049,7 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { /// FoldBranchToCommonDest - If this basic block is simple enough, and if a /// predecessor branches to us and one of our successors, fold the block into /// the predecessor and use logical operations to pick the right destination. -bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL, - unsigned BonusInstThreshold) { +bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { BasicBlock *BB = BI->getParent(); Instruction *Cond = nullptr; @@ -2124,7 +2105,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL, // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(I)) continue; - if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I, DL)) + if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I)) return false; // I has only one use and can be executed unconditionally. Instruction *User = dyn_cast<Instruction>(I->user_back()); @@ -2536,17 +2517,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // The weight to CommonDest should be PredCommon * SuccTotal + // PredOther * SuccCommon. // The weight to OtherDest should be PredOther * SuccOther. - SmallVector<uint64_t, 2> NewWeights; - NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) + - PredOther * SuccCommon); - NewWeights.push_back(PredOther * SuccOther); + uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) + + PredOther * SuccCommon, + PredOther * SuccOther}; // Halve the weights if any of them cannot fit in an uint32_t FitWeights(NewWeights); - SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end()); PBI->setMetadata(LLVMContext::MD_prof, - MDBuilder(BI->getContext()). - createBranchWeights(MDWeights)); + MDBuilder(BI->getContext()) + .createBranchWeights(NewWeights[0], NewWeights[1])); } // OtherDest may have phi nodes. If so, add an entry from PBI's @@ -2719,8 +2698,9 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { /// We prefer to split the edge to 'end' so that there is a true/false entry to /// the PHI, merging the third icmp into the switch. 
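A worked check of the merged-weight formula above, with made-up counts (PredCommon = 3, PredOther = 1, SuccCommon = 4, SuccOther = 2):

    #include <cassert>
    int main() {
      unsigned long long PredCommon = 3, PredOther = 1;
      unsigned long long SuccCommon = 4, SuccOther = 2;
      // Weight to CommonDest: PredCommon * SuccTotal + PredOther * SuccCommon.
      unsigned long long ToCommon =
          PredCommon * (SuccCommon + SuccOther) + PredOther * SuccCommon;
      // Weight to OtherDest: PredOther * SuccOther.
      unsigned long long ToOther = PredOther * SuccOther;
      assert(ToCommon == 22 && ToOther == 2); // 3*6 + 1*4, and 1*2
      return 0;
    }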
static bool TryToSimplifyUncondBranchWithICmpInIt( - ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI, - unsigned BonusInstThreshold, const DataLayout *DL, AssumptionCache *AC) { + ICmpInst *ICI, IRBuilder<> &Builder, const DataLayout &DL, + const TargetTransformInfo &TTI, unsigned BonusInstThreshold, + AssumptionCache *AC) { BasicBlock *BB = ICI->getParent(); // If the block has any PHIs in it or the icmp has multiple uses, it is too @@ -2753,7 +2733,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( ICI->eraseFromParent(); } // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } // Ok, the block is reachable from the default dest. If the constant we're @@ -2769,7 +2749,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); // BB is now empty, so it is likely to simplify away. - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } // The use of the icmp has to be in the 'end' block, by the only PHI node in @@ -2825,8 +2805,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch. /// Check to see if it is branching on an or/and chain of icmp instructions, and /// fold it into a switch instruction if so. -static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL, - IRBuilder<> &Builder) { +static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, + const DataLayout &DL) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); if (!Cond) return false; @@ -2901,10 +2881,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL, Builder.SetInsertPoint(BI); // Convert pointer to int before we switch. if (CompVal->getType()->isPointerTy()) { - assert(DL && "Cannot switch on pointer without DataLayout"); - CompVal = Builder.CreatePtrToInt(CompVal, - DL->getIntPtrType(CompVal->getType()), - "magicptr"); + CompVal = Builder.CreatePtrToInt( + CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr"); } // Create the new switch instruction now. @@ -2949,20 +2927,9 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { return false; // Turn all invokes that unwind here into calls and delete the basic block. - bool InvokeRequiresTableEntry = false; - bool Changed = false; for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); - - if (II->hasFnAttr(Attribute::UWTable)) { - // Don't remove an `invoke' instruction if the ABI requires an entry into - // the table. - InvokeRequiresTableEntry = true; - continue; - } - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - // Insert a call instruction before the invoke. CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); Call->takeName(II); @@ -2982,14 +2949,11 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { // Finally, delete the invoke instruction! II->eraseFromParent(); - Changed = true; } - if (!InvokeRequiresTableEntry) - // The landingpad is now unreachable. Zap it. - BB->eraseFromParent(); - - return Changed; + // The landingpad is now unreachable. Zap it. 
+ BB->eraseFromParent(); + return true; } bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { @@ -3121,55 +3085,6 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { --i; --e; Changed = true; } - // If the default value is unreachable, figure out the most popular - // destination and make it the default. - if (SI->getDefaultDest() == BB) { - std::map<BasicBlock*, std::pair<unsigned, unsigned> > Popularity; - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - std::pair<unsigned, unsigned> &entry = - Popularity[i.getCaseSuccessor()]; - if (entry.first == 0) { - entry.first = 1; - entry.second = i.getCaseIndex(); - } else { - entry.first++; - } - } - - // Find the most popular block. - unsigned MaxPop = 0; - unsigned MaxIndex = 0; - BasicBlock *MaxBlock = nullptr; - for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator - I = Popularity.begin(), E = Popularity.end(); I != E; ++I) { - if (I->second.first > MaxPop || - (I->second.first == MaxPop && MaxIndex > I->second.second)) { - MaxPop = I->second.first; - MaxIndex = I->second.second; - MaxBlock = I->first; - } - } - if (MaxBlock) { - // Make this the new default, allowing us to delete any explicit - // edges to it. - SI->setDefaultDest(MaxBlock); - Changed = true; - - // If MaxBlock has phinodes in it, remove MaxPop-1 entries from - // it. - if (isa<PHINode>(MaxBlock->begin())) - for (unsigned i = 0; i != MaxPop-1; ++i) - MaxBlock->removePredecessor(SI->getParent()); - - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) - if (i.getCaseSuccessor() == MaxBlock) { - SI->removeCase(i); - --i; --e; - } - } - } } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { if (II->getUnwindDest() == BB) { // Convert the invoke to a call instruction. This would be a good @@ -3203,70 +3118,122 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { return Changed; } -/// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a -/// integer range comparison into a sub, an icmp and a branch. -static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { - assert(SI->getNumCases() > 1 && "Degenerate switch?"); +static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { + assert(Cases.size() >= 1); - // Make sure all cases point to the same destination and gather the values. - SmallVector<ConstantInt *, 16> Cases; - SwitchInst::CaseIt I = SI->case_begin(); - Cases.push_back(I.getCaseValue()); - SwitchInst::CaseIt PrevI = I++; - for (SwitchInst::CaseIt E = SI->case_end(); I != E; PrevI = I++) { - if (PrevI.getCaseSuccessor() != I.getCaseSuccessor()) + array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate); + for (size_t I = 1, E = Cases.size(); I != E; ++I) { + if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1) return false; - Cases.push_back(I.getCaseValue()); } - assert(Cases.size() == SI->getNumCases() && "Not all cases gathered"); + return true; +} - // Sort the case values, then check if they form a range we can transform. - array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate); - for (unsigned I = 1, E = Cases.size(); I != E; ++I) { - if (Cases[I-1]->getValue() != Cases[I]->getValue()+1) - return false; +/// Turn a switch with two reachable destinations into an integer range +/// comparison and branch. 
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { + assert(SI->getNumCases() > 1 && "Degenerate switch?"); + + bool HasDefault = + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + + // Partition the cases into two sets with different destinations. + BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr; + BasicBlock *DestB = nullptr; + SmallVector <ConstantInt *, 16> CasesA; + SmallVector <ConstantInt *, 16> CasesB; + + for (SwitchInst::CaseIt I : SI->cases()) { + BasicBlock *Dest = I.getCaseSuccessor(); + if (!DestA) DestA = Dest; + if (Dest == DestA) { + CasesA.push_back(I.getCaseValue()); + continue; + } + if (!DestB) DestB = Dest; + if (Dest == DestB) { + CasesB.push_back(I.getCaseValue()); + continue; + } + return false; // More than two destinations. } - Constant *Offset = ConstantExpr::getNeg(Cases.back()); - Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases()); + assert(DestA && DestB && "Single-destination switch should have been folded."); + assert(DestA != DestB); + assert(DestB != SI->getDefaultDest()); + assert(!CasesB.empty() && "There must be non-default cases."); + assert(!CasesA.empty() || HasDefault); + + // Figure out if one of the sets of cases form a contiguous range. + SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr; + BasicBlock *ContiguousDest = nullptr; + BasicBlock *OtherDest = nullptr; + if (!CasesA.empty() && CasesAreContiguous(CasesA)) { + ContiguousCases = &CasesA; + ContiguousDest = DestA; + OtherDest = DestB; + } else if (CasesAreContiguous(CasesB)) { + ContiguousCases = &CasesB; + ContiguousDest = DestB; + OtherDest = DestA; + } else + return false; + + // Start building the compare and branch. + + Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back()); + Constant *NumCases = ConstantInt::get(Offset->getType(), ContiguousCases->size()); Value *Sub = SI->getCondition(); if (!Offset->isNullValue()) - Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off"); + Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off"); + Value *Cmp; // If NumCases overflowed, then all possible values jump to the successor. - if (NumCases->isNullValue() && SI->getNumCases() != 0) + if (NumCases->isNullValue() && !ContiguousCases->empty()) Cmp = ConstantInt::getTrue(SI->getContext()); else Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); - BranchInst *NewBI = Builder.CreateCondBr( - Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest()); + BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest); // Update weight for the newly-created conditional branch. - SmallVector<uint64_t, 8> Weights; - bool HasWeights = HasBranchWeights(SI); - if (HasWeights) { + if (HasBranchWeights(SI)) { + SmallVector<uint64_t, 8> Weights; GetBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { - // Combine all weights for the cases to be the true weight of NewBI. - // We assume that the sum of all weights for a Terminator can fit into 32 - // bits. 
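At source level, the partition-based transform corresponds to the following hypothetical before/after: the contiguous case set {3, 4, 5} shares one destination, so the switch reduces to the sub and unsigned compare built with CreateAdd and CreateICmpULT above:

    bool rangeBefore(unsigned X) {
      switch (X) {
      case 3: case 4: case 5: return true;
      default:                return false;
      }
    }
    bool rangeAfter(unsigned X) {
      return X - 3u < 3u; // offset by -3, then one unsigned compare
    }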
-      uint32_t NewTrueWeight = 0;
-      for (unsigned I = 1, E = Weights.size(); I != E; ++I)
-        NewTrueWeight += (uint32_t)Weights[I];
+      uint64_t TrueWeight = 0;
+      uint64_t FalseWeight = 0;
+      for (size_t I = 0, E = Weights.size(); I != E; ++I) {
+        if (SI->getSuccessor(I) == ContiguousDest)
+          TrueWeight += Weights[I];
+        else
+          FalseWeight += Weights[I];
+      }
+      while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
+        TrueWeight /= 2;
+        FalseWeight /= 2;
+      }
       NewBI->setMetadata(LLVMContext::MD_prof,
-                         MDBuilder(SI->getContext()).
-                              createBranchWeights(NewTrueWeight,
-                                                  (uint32_t)Weights[0]));
+                         MDBuilder(SI->getContext()).createBranchWeights(
+                             (uint32_t)TrueWeight, (uint32_t)FalseWeight));
     }
   }
 
-  // Prune obsolete incoming values off the successor's PHI nodes.
-  for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
-       isa<PHINode>(BBI); ++BBI) {
-    for (unsigned I = 0, E = SI->getNumCases()-1; I != E; ++I)
+  // Prune obsolete incoming values off the successors' PHI nodes.
+  for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
+    unsigned PreviousEdges = ContiguousCases->size();
+    if (ContiguousDest == SI->getDefaultDest()) ++PreviousEdges;
+    for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+  }
+  for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+    unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+    if (OtherDest == SI->getDefaultDest()) ++PreviousEdges;
+    for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
       cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
   }
+
+  // Drop the switch.
   SI->eraseFromParent();
 
   return true;
@@ -3274,8 +3241,8 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
 
 /// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
 /// and use it to remove dead cases.
-static bool EliminateDeadSwitchCases(SwitchInst *SI, const DataLayout *DL,
-                                     AssumptionCache *AC) {
+static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
+                                     const DataLayout &DL) {
   Value *Cond = SI->getCondition();
   unsigned Bits = Cond->getType()->getIntegerBitWidth();
   APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
@@ -3426,9 +3393,8 @@ static Constant *LookupConstant(Value *V,
 /// constant or can be replaced by constants from the ConstantPool. Returns the
 /// resulting constant on success, 0 otherwise.
 static Constant *
-ConstantFold(Instruction *I,
-             const SmallDenseMap<Value *, Constant *> &ConstantPool,
-             const DataLayout *DL) {
+ConstantFold(Instruction *I, const DataLayout &DL,
+             const SmallDenseMap<Value *, Constant *> &ConstantPool) {
   if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
     Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
     if (!A)
       return nullptr;
@@ -3448,9 +3414,10 @@ ConstantFold(Instruction *I,
       return nullptr;
   }
 
-  if (CmpInst *Cmp = dyn_cast<CmpInst>(I))
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
     return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
                                            COps[1], DL);
+  }
 
   return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL);
 }
@@ -3460,12 +3427,10 @@ ConstantFold(Instruction *I,
 /// destinations CaseDest corresponding to value CaseVal (0 for the default
 /// case), of a switch instruction SI.
static bool -GetCaseResults(SwitchInst *SI, - ConstantInt *CaseVal, - BasicBlock *CaseDest, +GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, BasicBlock **CommonDest, - SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res, - const DataLayout *DL) { + SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res, + const DataLayout &DL) { // The block from which we enter the common destination. BasicBlock *Pred = SI->getParent(); @@ -3484,7 +3449,7 @@ GetCaseResults(SwitchInst *SI, } else if (isa<DbgInfoIntrinsic>(I)) { // Skip debug intrinsic. continue; - } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) { + } else if (Constant *C = ConstantFold(I, DL, ConstantPool)) { // Instruction is side-effect free and constant. // If the instruction has uses outside this block or a phi node slot for @@ -3555,11 +3520,11 @@ static void MapCaseToResult(ConstantInt *CaseVal, // results for the PHI node of the common destination block for a switch // instruction. Returns false if multiple PHI nodes have been found or if // there is not a common destination block for the switch. -static bool InitializeUniqueCases( - SwitchInst *SI, const DataLayout *DL, PHINode *&PHI, - BasicBlock *&CommonDest, - SwitchCaseResultVectorTy &UniqueResults, - Constant *&DefaultResult) { +static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, + BasicBlock *&CommonDest, + SwitchCaseResultVectorTy &UniqueResults, + Constant *&DefaultResult, + const DataLayout &DL) { for (auto &I : SI->cases()) { ConstantInt *CaseVal = I.getCaseValue(); @@ -3666,15 +3631,15 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI, /// phi nodes in a common successor block with only two different /// constant values, replace the switch with select. static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, - const DataLayout *DL, AssumptionCache *AC) { + AssumptionCache *AC, const DataLayout &DL) { Value *const Cond = SI->getCondition(); PHINode *PHI = nullptr; BasicBlock *CommonDest = nullptr; Constant *DefaultResult; SwitchCaseResultVectorTy UniqueResults; // Collect all the cases that will deliver the same value from the switch. - if (!InitializeUniqueCases(SI, DL, PHI, CommonDest, UniqueResults, - DefaultResult)) + if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult, + DL)) return false; // Selects choose between maximum two values. if (UniqueResults.size() != 2) @@ -3701,12 +3666,10 @@ namespace { /// SwitchLookupTable - Create a lookup table to use as a switch replacement /// with the contents of Values, using DefaultValue to fill any holes in the /// table. - SwitchLookupTable(Module &M, - uint64_t TableSize, - ConstantInt *Offset, - const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values, - Constant *DefaultValue, - const DataLayout *DL); + SwitchLookupTable( + Module &M, uint64_t TableSize, ConstantInt *Offset, + const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, + Constant *DefaultValue, const DataLayout &DL); /// BuildLookup - Build instructions with Builder to retrieve the value at /// the position given by Index in the lookup table. @@ -3714,8 +3677,7 @@ namespace { /// WouldFitInRegister - Return true if a table with TableSize elements of /// type ElementType would fit in a target-legal register. 
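The SwitchToSelect path above admits switches whose cases feed a single phi with only two distinct constants; a hypothetical source-level equivalent:

    // Before: every case produces one of just two values.
    int selBefore(unsigned X) {
      switch (X) {
      case 0: case 2: return 10;
      default:        return 20;
      }
    }
    // After (one possible lowering): a condition plus a select.
    int selAfter(unsigned X) {
      return (X == 0 || X == 2) ? 10 : 20;
    }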
- static bool WouldFitInRegister(const DataLayout *DL, - uint64_t TableSize, + static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, const Type *ElementType); private: @@ -3757,12 +3719,10 @@ namespace { }; } -SwitchLookupTable::SwitchLookupTable(Module &M, - uint64_t TableSize, - ConstantInt *Offset, - const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values, - Constant *DefaultValue, - const DataLayout *DL) +SwitchLookupTable::SwitchLookupTable( + Module &M, uint64_t TableSize, ConstantInt *Offset, + const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, + Constant *DefaultValue, const DataLayout &DL) : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr), LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) { assert(Values.size() && "Can't build lookup table without values!"); @@ -3924,19 +3884,17 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { "switch.tableidx.zext"); Value *GEPIndices[] = { Builder.getInt32(0), Index }; - Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices, - "switch.gep"); + Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array, + GEPIndices, "switch.gep"); return Builder.CreateLoad(GEP, "switch.load"); } } llvm_unreachable("Unknown lookup table kind!"); } -bool SwitchLookupTable::WouldFitInRegister(const DataLayout *DL, +bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, const Type *ElementType) { - if (!DL) - return false; const IntegerType *IT = dyn_cast<IntegerType>(ElementType); if (!IT) return false; @@ -3946,17 +3904,16 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *DL, // Avoid overflow, fitsInLegalInteger uses unsigned int for the width. if (TableSize >= UINT_MAX/IT->getBitWidth()) return false; - return DL->fitsInLegalInteger(TableSize * IT->getBitWidth()); + return DL.fitsInLegalInteger(TableSize * IT->getBitWidth()); } /// ShouldBuildLookupTable - Determine whether a lookup table should be built /// for this switch, based on the number of cases, size of the table and the /// types of the results. -static bool ShouldBuildLookupTable(SwitchInst *SI, - uint64_t TableSize, - const TargetTransformInfo &TTI, - const DataLayout *DL, - const SmallDenseMap<PHINode*, Type*>& ResultTypes) { +static bool +ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, + const TargetTransformInfo &TTI, const DataLayout &DL, + const SmallDenseMap<PHINode *, Type *> &ResultTypes) { if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10) return false; // TableSize overflowed, or mul below might overflow. @@ -4079,10 +4036,9 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, /// SwitchToLookupTable - If the switch is only used to initialize one or more /// phi nodes in a common successor block with different constant values, /// replace the switch with lookup tables. -static bool SwitchToLookupTable(SwitchInst *SI, - IRBuilder<> &Builder, - const TargetTransformInfo &TTI, - const DataLayout* DL) { +static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, + const DataLayout &DL, + const TargetTransformInfo &TTI) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); // Only build lookup table when we have a target that supports it. @@ -4153,14 +4109,14 @@ static bool SwitchToLookupTable(SwitchInst *SI, // or a bitmask that fits in a register. 
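The register-fit test is plain arithmetic once DataLayout is mandatory; a stand-alone sketch assuming a target whose widest legal integer is 64 bits (that constant is an assumption here, not queried from a real target):

    // Mirrors WouldFitInRegister: guard against overflow, then compare the
    // total bit size of the table against the largest legal integer.
    bool wouldFitInRegister(unsigned long long TableSize, unsigned BitWidth) {
      const unsigned LargestLegalBits = 64; // assumed target property
      if (BitWidth == 0 || TableSize >= ~0u / BitWidth)
        return false; // the width product below would overflow
      return TableSize * BitWidth <= LargestLegalBits;
    }

For instance, 8 one-bit entries need 8 bits and fit; 128 entries of i8 need 1024 bits and do not.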
   SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
   bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
-                                          &CommonDest, DefaultResultsList, DL);
+                                          &CommonDest, DefaultResultsList, DL);
 
   bool NeedMask = (TableHasHoles && !HasDefaultResults);
   if (NeedMask) {
     // As an extra penalty for the validity test we require more cases.
     if (SI->getNumCases() < 4)  // FIXME: Find best threshold value (benchmark).
       return false;
-    if (!(DL && DL->fitsInLegalInteger(TableSize)))
+    if (!DL.fitsInLegalInteger(TableSize))
       return false;
   }
@@ -4193,19 +4149,18 @@ static bool SwitchToLookupTable(SwitchInst *SI,
          "It is impossible for a switch to have more entries than the max "
          "representable value of its input integer type's size.");
 
-  // If we have a fully covered lookup table, unconditionally branch to the
-  // lookup table BB. Otherwise, check if the condition value is within the case
-  // range. If it is so, branch to the new BB. Otherwise branch to SI's default
-  // destination.
+  // If the default destination is unreachable, or if the lookup table covers
+  // all values of the conditional variable, branch directly to the lookup table
+  // BB. Otherwise, check that the condition is within the case range.
+  const bool DefaultIsReachable =
+      !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+  const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
   BranchInst *RangeCheckBranch = nullptr;
 
-  const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
-  if (GeneratingCoveredLookupTable) {
+  if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
     Builder.CreateBr(LookupBB);
-    // We cached PHINodes in PHIs, to avoid accessing deleted PHINodes later,
-    // do not delete PHINodes here.
-    SI->getDefaultDest()->removePredecessor(SI->getParent(),
-                                            true/*DontDeleteUselessPHIs*/);
+    // Note: We call removePredecessor later since we need to be able to get the
+    // PHI value for the default case in case we're using a bit mask.
   } else {
     Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
                                        MinCaseVal->getType(), TableSize));
@@ -4257,6 +4212,13 @@ static bool SwitchToLookupTable(SwitchInst *SI,
     AddPredecessorToBlock(SI->getDefaultDest(), MaskBB, SI->getParent());
   }
 
+  if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+    // We cached PHINodes in PHIs, to avoid accessing deleted PHINodes later,
+    // do not delete PHINodes here.
+    SI->getDefaultDest()->removePredecessor(SI->getParent(),
+                                            /*DontDeleteUselessPHIs=*/true);
+  }
+
   bool ReturnedEarly = false;
   for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
     PHINode *PHI = PHIs[I];
@@ -4317,12 +4279,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   // see if that predecessor totally determines the outcome of this switch.
   if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
     if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
-      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+      return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   Value *Cond = SI->getCondition();
   if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
     if (SimplifySwitchOnSelect(SI, Select))
-      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+      return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   // If the block only contains the switch, see if we can fold the block
   // away into any preds.
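The overall effect of SwitchToLookupTable, sketched with a hypothetical source-level pair: the phi constants move into a constant array and the switch becomes a bounds check plus a load:

    int lutBefore(unsigned X) {
      switch (X) {
      case 0: return 5;
      case 1: return 7;
      case 2: return 9;
      case 3: return 11;
      default: return 0;
      }
    }
    int lutAfter(unsigned X) {
      static const int Table[4] = {5, 7, 9, 11}; // the "switch.table" array
      return X < 4 ? Table[X] : 0;               // range check, then load
    }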
@@ -4332,25 +4294,25 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
       ++BBI;
     if (SI == &*BBI)
       if (FoldValueComparisonIntoPredecessors(SI, Builder))
-        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
   }
 
   // Try to transform the switch into an icmp and a branch.
   if (TurnSwitchRangeIntoICmp(SI, Builder))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   // Remove unreachable cases.
-  if (EliminateDeadSwitchCases(SI, DL, AC))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+  if (EliminateDeadSwitchCases(SI, AC, DL))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
-  if (SwitchToSelect(SI, Builder, DL, AC))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+  if (SwitchToSelect(SI, Builder, AC, DL))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   if (ForwardSwitchConditionToPHI(SI))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
-  if (SwitchToLookupTable(SI, Builder, TTI, DL))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+  if (SwitchToLookupTable(SI, Builder, DL, TTI))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   return false;
 }
@@ -4387,11 +4349,87 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
 
   if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
     if (SimplifyIndirectBrOnSelect(IBI, SI))
-      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+      return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
   }
   return Changed;
 }
 
+/// Given a block with only a single landing pad and an unconditional branch,
+/// try to find another basic block with which this one can be merged. This
+/// handles cases where we have multiple invokes with unique landing pads, but
+/// a shared handler.
+///
+/// We specifically choose to not worry about merging non-empty blocks
+/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
+/// practice, the optimizer produces empty landing pad blocks quite frequently
+/// when dealing with exception-dense code. (see: instcombine, gvn, if-else
+/// sinking in this file)
+///
+/// This is primarily a code size optimization. We need to avoid performing
+/// any transform which might inhibit optimization (such as our ability to
+/// specialize a particular handler via tail commoning). We do this by not
+/// merging any blocks which require us to introduce a phi. Since the same
+/// values are flowing through both blocks, we don't lose any ability to
+/// specialize. If anything, we make such specialization more likely.
+///
+/// TODO - This transformation could remove entries from a phi in the target
+/// block when the inputs in the phi are the same for the two blocks being
+/// merged. In some cases, this could result in removal of the PHI entirely.
+static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
+                                 BasicBlock *BB) {
+  auto Succ = BB->getUniqueSuccessor();
+  assert(Succ);
+  // If there's a phi in the successor block, we'd likely have to introduce
+  // a phi into the merged landing pad block.
+  if (isa<PHINode>(*Succ->begin()))
+    return false;
+
+  for (BasicBlock *OtherPred : predecessors(Succ)) {
+    if (BB == OtherPred)
+      continue;
+    BasicBlock::iterator I = OtherPred->begin();
+    LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
+    if (!LPad2 || !LPad2->isIdenticalTo(LPad))
+      continue;
+    for (++I; isa<DbgInfoIntrinsic>(I); ++I) {}
+    BranchInst *BI2 = dyn_cast<BranchInst>(I);
+    if (!BI2 || !BI2->isIdenticalTo(BI))
+      continue;
+
+    // We've found an identical block. Update our predecessors to take that
+    // path instead and make ourselves dead.
+    SmallSet<BasicBlock *, 16> Preds;
+    Preds.insert(pred_begin(BB), pred_end(BB));
+    for (BasicBlock *Pred : Preds) {
+      InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
+      assert(II->getNormalDest() != BB &&
+             II->getUnwindDest() == BB && "unexpected successor");
+      II->setUnwindDest(OtherPred);
+    }
+
+    // The debug info in OtherPred doesn't cover the merged control flow that
+    // used to go through BB. We need to delete it or update it.
+    for (auto I = OtherPred->begin(), E = OtherPred->end();
+         I != E;) {
+      Instruction &Inst = *I; I++;
+      if (isa<DbgInfoIntrinsic>(Inst))
+        Inst.eraseFromParent();
+    }
+
+    SmallSet<BasicBlock *, 16> Succs;
+    Succs.insert(succ_begin(BB), succ_end(BB));
+    for (BasicBlock *Succ : Succs) {
+      Succ->removePredecessor(BB);
+    }
+
+    IRBuilder<> Builder(BI);
+    Builder.CreateUnreachable();
+    BI->eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
 bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
   BasicBlock *BB = BI->getParent();
 
@@ -4411,17 +4449,26 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
     for (++I; isa<DbgInfoIntrinsic>(I); ++I)
       ;
     if (I->isTerminator() &&
-        TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI,
-                                              BonusInstThreshold, DL, AC))
+        TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, DL, TTI,
+                                              BonusInstThreshold, AC))
       return true;
   }
 
+  // See if we can merge an empty landing pad block with another which is
+  // equivalent.
+  if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
+    for (++I; isa<DbgInfoIntrinsic>(I); ++I) {}
+    if (I->isTerminator() &&
+        TryToMergeLandingPad(LPad, BI, BB))
+      return true;
+  }
+
   // If this basic block is ONLY a compare and a branch, and if a predecessor
   // branches to us and our successor, fold the comparison into the
   // predecessor and use logical operations to update the incoming value
   // for PHI nodes in common successor.
-  if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
-    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+  if (FoldBranchToCommonDest(BI, BonusInstThreshold))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
   return false;
 }
 
@@ -4436,7 +4483,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // switch.
   if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
     if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
-      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+      return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   // This block must be empty, except for the setcond inst, if it exists.
   // Ignore dbg intrinsics.
@@ -4446,26 +4493,26 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { ++I; if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } else if (&*I == cast<Instruction>(BI->getCondition())){ ++I; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(I)) ++I; if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } } // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. - if (SimplifyBranchOnICmpChain(BI, DL, Builder)) + if (SimplifyBranchOnICmpChain(BI, Builder, DL)) return true; // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. - if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + if (FoldBranchToCommonDest(BI, BonusInstThreshold)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; // We have a conditional branch to two blocks that are only reachable // from BI. We know that the condbr dominates the two blocks, so see if @@ -4473,16 +4520,16 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // can hoist it up to the branching block. if (BI->getSuccessor(0)->getSinglePredecessor()) { if (BI->getSuccessor(1)->getSinglePredecessor()) { - if (HoistThenElseCodeToIf(BI, DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + if (HoistThenElseCodeToIf(BI, TTI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } else { // If Successor #1 has multiple preds, we may be able to conditionally // execute Successor #0 if it branches to Successor #1. TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator(); if (Succ0TI->getNumSuccessors() == 1 && Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } } else if (BI->getSuccessor(1)->getSinglePredecessor()) { // If Successor #0 has multiple preds, we may be able to conditionally @@ -4490,8 +4537,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator(); if (Succ1TI->getNumSuccessors() == 1 && Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) - if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; } // If this is a branch on a phi node in the current block, thread control @@ -4499,14 +4546,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition())) if (PN->getParent() == BI->getParent()) if (FoldCondBranchOnPHI(BI, DL)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; // Scan predecessor blocks for conditional branches. 
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) if (SimplifyCondBranchToCondBranch(PBI, BI)) - return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true; + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; return false; } @@ -4618,7 +4665,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // eliminate it, do so now. if (PHINode *PN = dyn_cast<PHINode>(BB->begin())) if (PN->getNumIncomingValues() == 2) - Changed |= FoldTwoEntryPHINode(PN, DL); + Changed |= FoldTwoEntryPHINode(PN, TTI, DL); Builder.SetInsertPoint(BB->getTerminator()); if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { @@ -4650,7 +4697,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { /// of the CFG. It returns true if a modification was made. /// bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, - unsigned BonusInstThreshold, const DataLayout *DL, - AssumptionCache *AC) { - return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AC).run(BB); + unsigned BonusInstThreshold, AssumptionCache *AC) { + return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(), + BonusInstThreshold, AC).run(BB); } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index d54c09a..3757a80 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -48,22 +47,15 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; // May be NULL SmallVectorImpl<WeakVH> &DeadInsts; bool Changed; public: - SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr) : - L(Loop), - LI(LPM->getAnalysisIfAvailable<LoopInfo>()), - SE(SE), - DeadInsts(Dead), - Changed(false) { - DataLayoutPass *DLP = LPM->getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; + SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI, + SmallVectorImpl<WeakVH> &Dead) + : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); } @@ -277,95 +269,57 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, Value *IVOperand) { - // Currently we only handle instructions of the form "add <indvar> <value>" - unsigned Op = BO->getOpcode(); - if (Op != Instruction::Add) + // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`. 
+ if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap()) return false; - // If BO is already both nuw and nsw then there is nothing left to do - if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap()) + const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *, + SCEV::NoWrapFlags); + + switch (BO->getOpcode()) { + default: return false; - IntegerType *IT = cast<IntegerType>(IVOperand->getType()); - Value *OtherOperand = nullptr; - if (BO->getOperand(0) == IVOperand) { - OtherOperand = BO->getOperand(1); - } else { - assert(BO->getOperand(1) == IVOperand && "only other use!"); - OtherOperand = BO->getOperand(0); + case Instruction::Add: + GetExprForBO = &ScalarEvolution::getAddExpr; + break; + + case Instruction::Sub: + GetExprForBO = &ScalarEvolution::getMinusSCEV; + break; + + case Instruction::Mul: + GetExprForBO = &ScalarEvolution::getMulExpr; + break; } - bool Changed = false; - const SCEV *OtherOpSCEV = SE->getSCEV(OtherOperand); - if (OtherOpSCEV == SE->getCouldNotCompute()) - return false; + unsigned BitWidth = cast<IntegerType>(BO->getType())->getBitWidth(); + Type *WideTy = IntegerType::get(BO->getContext(), BitWidth * 2); + const SCEV *LHS = SE->getSCEV(BO->getOperand(0)); + const SCEV *RHS = SE->getSCEV(BO->getOperand(1)); - const SCEV *IVOpSCEV = SE->getSCEV(IVOperand); - const SCEV *ZeroSCEV = SE->getConstant(IVOpSCEV->getType(), 0); + bool Changed = false; - if (!BO->hasNoSignedWrap()) { - // Upgrade the add to an "add nsw" if we can prove that it will never - // sign-overflow or sign-underflow. - - const SCEV *SignedMax = - SE->getConstant(APInt::getSignedMaxValue(IT->getBitWidth())); - const SCEV *SignedMin = - SE->getConstant(APInt::getSignedMinValue(IT->getBitWidth())); - - // The addition "IVOperand + OtherOp" does not sign-overflow if the result - // is sign-representable in 2's complement in the given bit-width. - // - // If OtherOp is SLT 0, then for an IVOperand in [SignedMin - OtherOp, - // SignedMax], "IVOperand + OtherOp" is in [SignedMin, SignedMax + OtherOp]. - // Everything in [SignedMin, SignedMax + OtherOp] is representable since - // SignedMax + OtherOp is at least -1. - // - // If OtherOp is SGE 0, then for an IVOperand in [SignedMin, SignedMax - - // OtherOp], "IVOperand + OtherOp" is in [SignedMin + OtherOp, SignedMax]. - // Everything in [SignedMin + OtherOp, SignedMax] is representable since - // SignedMin + OtherOp is at most -1. - // - // It follows that for all values of IVOperand in [SignedMin - smin(0, - // OtherOp), SignedMax - smax(0, OtherOp)] the result of the add is - // representable (i.e. there is no sign-overflow). 
- - const SCEV *UpperDelta = SE->getSMaxExpr(ZeroSCEV, OtherOpSCEV); - const SCEV *UpperLimit = SE->getMinusSCEV(SignedMax, UpperDelta); - - bool NeverSignedOverflows = - SE->isKnownPredicate(ICmpInst::ICMP_SLE, IVOpSCEV, UpperLimit); - - if (NeverSignedOverflows) { - const SCEV *LowerDelta = SE->getSMinExpr(ZeroSCEV, OtherOpSCEV); - const SCEV *LowerLimit = SE->getMinusSCEV(SignedMin, LowerDelta); - - bool NeverSignedUnderflows = - SE->isKnownPredicate(ICmpInst::ICMP_SGE, IVOpSCEV, LowerLimit); - if (NeverSignedUnderflows) { - BO->setHasNoSignedWrap(true); - Changed = true; - } + if (!BO->hasNoUnsignedWrap()) { + const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy); + const SCEV *OpAfterExtend = (SE->*GetExprForBO)( + SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy), + SCEV::FlagAnyWrap); + if (ExtendAfterOp == OpAfterExtend) { + BO->setHasNoUnsignedWrap(); + SE->forgetValue(BO); + Changed = true; } } - if (!BO->hasNoUnsignedWrap()) { - // Upgrade the add computing "IVOperand + OtherOp" to an "add nuw" if we can - // prove that it will never unsigned-overflow (i.e. the result will always - // be representable in the given bit-width). - // - // "IVOperand + OtherOp" is unsigned-representable in 2's complement iff it - // does not produce a carry. "IVOperand + OtherOp" produces no carry iff - // IVOperand ULE (UnsignedMax - OtherOp). - - const SCEV *UnsignedMax = - SE->getConstant(APInt::getMaxValue(IT->getBitWidth())); - const SCEV *UpperLimit = SE->getMinusSCEV(UnsignedMax, OtherOpSCEV); - - bool NeverUnsignedOverflows = - SE->isKnownPredicate(ICmpInst::ICMP_ULE, IVOpSCEV, UpperLimit); - - if (NeverUnsignedOverflows) { - BO->setHasNoUnsignedWrap(true); + if (!BO->hasNoSignedWrap()) { + const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy); + const SCEV *OpAfterExtend = (SE->*GetExprForBO)( + SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy), + SCEV::FlagAnyWrap); + if (ExtendAfterOp == OpAfterExtend) { + BO->setHasNoSignedWrap(); + SE->forgetValue(BO); Changed = true; } } @@ -562,8 +516,8 @@ void IVVisitor::anchor() { } bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) { - LoopInfo *LI = &LPM->getAnalysis<LoopInfo>(); - SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LPM, Dead); + LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead); SIV.simplifyUsers(CurrIV, V); return SIV.hasChanged(); } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp index cc97098..c499c87 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -25,7 +25,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -43,7 +43,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } /// runOnFunction - Remove instructions that simplify. 
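The rewritten strengthenOverflowingOperation above rests on a single identity: a binary operation cannot wrap exactly when evaluating it in twice the bit width, after extending the operands, matches extending the narrow result. SCEV establishes this symbolically; the concrete-value check below is only an illustration of the identity for unsigned 8-bit addition:

    #include <cassert>
    #include <cstdint>
    // zext(a + b) == zext(a) + zext(b) in the doubled width iff the narrow
    // add is nuw, mirroring the ExtendAfterOp / OpAfterExtend comparison.
    bool addIsNUW(uint8_t A, uint8_t B) {
      uint16_t ExtendAfterOp = static_cast<uint8_t>(A + B); // narrow, then widen
      uint16_t OpAfterExtend = uint16_t(A) + uint16_t(B);   // widen, then add
      return ExtendAfterOp == OpAfterExtend;
    }
    int main() {
      assert(addIsNUW(100, 100));  // 200 fits in 8 bits
      assert(!addIsNUW(200, 100)); // 300 wraps to 44
      return 0;
    }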
@@ -51,9 +51,9 @@ namespace {
       const DominatorTreeWrapperPass *DTWP =
           getAnalysisIfAvailable<DominatorTreeWrapperPass>();
       const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-      DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-      const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-      const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+      const DataLayout &DL = F.getParent()->getDataLayout();
+      const TargetLibraryInfo *TLI =
+          &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       AssumptionCache *AC =
           &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
       SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
@@ -106,7 +106,7 @@ char InstSimplifier::ID = 0;
 INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
                       "Remove redundant instructions", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
                     "Remove redundant instructions", false, false)
 char &llvm::InstructionSimplifierID = InstSimplifier::ID;
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 5a0d52e..6bbf828 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -30,7 +30,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 
 using namespace llvm;
@@ -120,12 +120,12 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
 /// string/memory copying library function \p Func.
 /// Acceptable functions are st[rp][n]?cpy, memmove, memcpy, and memset.
 /// Their fortified (_chk) counterparts are also accepted.
-static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func,
-                                            const DataLayout *DL) {
+static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func) {
+  const DataLayout &DL = F->getParent()->getDataLayout();
   FunctionType *FT = F->getFunctionType();
   LLVMContext &Context = F->getContext();
   Type *PCharTy = Type::getInt8PtrTy(Context);
-  Type *SizeTTy = DL ? DL->getIntPtrType(Context) : nullptr;
+  Type *SizeTTy = DL.getIntPtrType(Context);
   unsigned NumParams = FT->getNumParams();
 
   // All string libfuncs return the same type as the first parameter.
@@ -208,10 +208,6 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
   if (Len == 0)
     return Dst;
 
-  // These optimizations require DataLayout.
-  if (!DL)
-    return nullptr;
-
   return emitStrLenMemCpy(Src, Dst, Len, B);
 }
 
@@ -226,13 +222,13 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
   // Now that we have the destination's length, we must index into the
   // destination's pointer to get the actual memcpy destination (end of
   // the string .. we're concatenating).
-  Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+  Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr");
 
   // We have enough information to now generate the memcpy call to do the
   // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
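What the strcat fold above produces, restated with a hypothetical constant source "abc" (so Len = 3): the destination length is found once, then a single memcpy moves Len + 1 bytes so the nul terminator travels with the data:

    #include <cstring>
    // Source-level equivalent of emitStrLenMemCpy for strcat(Dst, "abc").
    char *strcatFolded(char *Dst) {
      char *CpyDst = Dst + std::strlen(Dst); // the "endptr" GEP
      std::memcpy(CpyDst, "abc", 3 + 1);     // copy the nul byte too
      return Dst;
    }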
- B.CreateMemCpy( - CpyDst, Src, - ConstantInt::get(DL->getIntPtrType(Src->getContext()), Len + 1), 1); + B.CreateMemCpy(CpyDst, Src, + ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1), + 1); return Dst; } @@ -269,10 +265,6 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { if (SrcLen == 0 || Len == 0) return Dst; - // These optimizations require DataLayout. - if (!DL) - return nullptr; - // We don't optimize this case if (Len < SrcLen) return nullptr; @@ -297,25 +289,21 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (!CharC) { - // These optimizations require DataLayout. - if (!DL) - return nullptr; - uint64_t Len = GetStringLength(SrcStr); if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; - return EmitMemChr( - SrcStr, CI->getArgOperand(1), // include nul. - ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len), B, DL, TLI); + return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), + B, DL, TLI); } // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { - if (DL && CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); + if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) + return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); return nullptr; } @@ -328,7 +316,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { return Constant::getNullValue(CI->getType()); // strchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); + return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr"); } Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { @@ -350,8 +338,8 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) - if (DL && CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, DL, TLI); + if (CharC->isZero()) + return EmitStrChr(SrcStr, '\0', B, TLI); return nullptr; } @@ -363,7 +351,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { return Constant::getNullValue(CI->getType()); // strrchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); + return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr"); } Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { @@ -398,12 +386,8 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { uint64_t Len1 = GetStringLength(Str1P); uint64_t Len2 = GetStringLength(Str2P); if (Len1 && Len2) { - // These optimizations require DataLayout. 
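On the strcmp fold here: when both operands are constant strings, GetStringLength reports each length including the terminating nul, so comparing min(Len1, Len2) bytes with memcmp suffices; the shorter string's nul participates and preserves the sign. A small demonstration under that assumption:

#include <cassert>
#include <cstring>

int main() {
  const char *A = "abc";  // length-with-nul 4
  const char *B = "abcd"; // length-with-nul 5
  // memcmp over min(4, 5) bytes sees 'a','b','c','\0' vs 'a','b','c','d',
  // so it agrees in sign with strcmp.
  assert((std::strcmp(A, B) < 0) == (std::memcmp(A, B, 4) < 0));
}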
- if (!DL) - return nullptr; - return EmitMemCmp(Str1P, Str2P, - ConstantInt::get(DL->getIntPtrType(CI->getContext()), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), std::min(Len1, Len2)), B, DL, TLI); } @@ -435,7 +419,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); - if (DL && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); StringRef Str1, Str2; @@ -462,17 +446,13 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - // These optimizations require DataLayout. - if (!DL) - return nullptr; - // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); if (Len == 0) @@ -481,7 +461,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(Dst, Src, - ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len), 1); + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1); return Dst; } @@ -490,17 +470,13 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // Verify the "stpcpy" function prototype. FunctionType *FT = Callee->getFunctionType(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy, DL)) - return nullptr; - - // These optimizations require DataLayout. - if (!DL) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = EmitStrLen(Src, B, DL, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; + return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr; } // See if we can get the length of the input string. @@ -509,9 +485,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { return nullptr; Type *PT = FT->getParamType(0); - Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len); + Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); Value *DstEnd = - B.CreateGEP(Dst, ConstantInt::get(DL->getIntPtrType(PT), Len - 1)); + B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. @@ -523,7 +499,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy)) return nullptr; Value *Dst = CI->getArgOperand(0); @@ -551,17 +527,13 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { if (Len == 0) return Dst; // strncpy(x, y, 0) -> x - // These optimizations require DataLayout. 
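The strncmp hunk above no longer gates strncmp(x, y, 1) -> memcmp(x, y, 1) on having DataLayout, which is sound because a one-byte comparison never needs to scan for a nul. Illustration:

#include <cassert>
#include <cstring>

int main() {
  const char *X = "apple", *Y = "banana";
  // Comparing exactly one byte: strncmp and memcmp are interchangeable.
  assert((std::strncmp(X, Y, 1) < 0) == (std::memcmp(X, Y, 1) < 0));
}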
- if (!DL) - return nullptr; - // Let strncpy handle the zero padding if (Len > SrcLen + 1) return nullptr; Type *PT = FT->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] - B.CreateMemCpy(Dst, Src, ConstantInt::get(DL->getIntPtrType(PT), Len), 1); + B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1); return Dst; } @@ -625,12 +597,12 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { if (I == StringRef::npos) // No match. return Constant::getNullValue(CI->getType()); - return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), "strpbrk"); } // strpbrk(s, "a") -> strchr(s, 'a') - if (DL && HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, DL, TLI); + if (HasS2 && S2.size() == 1) + return EmitStrChr(CI->getArgOperand(0), S2[0], B, TLI); return nullptr; } @@ -706,7 +678,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) { } // strcspn(s, "") -> strlen(s) - if (DL && HasS2 && S2.empty()) + if (HasS2 && S2.empty()) return EmitStrLen(CI->getArgOperand(0), B, DL, TLI); return nullptr; @@ -725,7 +697,7 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 - if (DL && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI); if (!StrLen) return nullptr; @@ -767,12 +739,98 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) { - Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, DL, TLI); + Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr; } return nullptr; } +Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy(32) || + !FT->getParamType(2)->isIntegerTy() || + !FT->getReturnType()->isPointerTy()) + return nullptr; + + Value *SrcStr = CI->getArgOperand(0); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + + // memchr(x, y, 0) -> null + if (LenC && LenC->isNullValue()) + return Constant::getNullValue(CI->getType()); + + // From now on we need at least constant length and string. + StringRef Str; + if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) + return nullptr; + + // Truncate the string to LenC. If Str is smaller than LenC we will still only + // scan the string, as reading past the end of it is undefined and we can just + // return null if we don't find the char. + Str = Str.substr(0, LenC->getZExtValue()); + + // If the char is variable but the input str and length are not we can turn + // this memchr call into a simple bit field test. Of course this only works + // when the return value is only checked against null. + // + // It would be really nice to reuse switch lowering here but we can't change + // the CFG at this point. 
+ // + // memchr("\r\n", C, 2) != nullptr -> (C & ((1 << '\r') | (1 << '\n'))) != 0 + // after bounds check. + if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) { + unsigned char Max = + *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()), + reinterpret_cast<const unsigned char *>(Str.end())); + + // Make sure the bit field we're about to create fits in a register on the + // target. + // FIXME: On a 64 bit architecture this prevents us from using the + // interesting range of alpha ascii chars. We could do better by emitting + // two bitfields or shifting the range by 64 if no lower chars are used. + if (!DL.fitsInLegalInteger(Max + 1)) + return nullptr; + + // For the bit field use a power-of-2 type with at least 8 bits to avoid + // creating unnecessary illegal types. + unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max)); + + // Now build the bit field. + APInt Bitfield(Width, 0); + for (char C : Str) + Bitfield.setBit((unsigned char)C); + Value *BitfieldC = B.getInt(Bitfield); + + // First check that the bit field access is within bounds. + Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType()); + Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width), + "memchr.bounds"); + + // Create code that checks if the given bit is set in the field. + Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C); + Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits"); + + // Finally merge both checks and cast to pointer type. The inttoptr + // implicitly zexts the i1 to intptr type. + return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType()); + } + + // Check if all arguments are constants. If so, we can constant fold. + if (!CharC) + return nullptr; + + // Compute the offset. + size_t I = Str.find(CharC->getSExtValue() & 0xFF); + if (I == StringRef::npos) // Didn't find the char. memchr returns null. + return Constant::getNullValue(CI->getType()); + + // memchr(s+n,c,l) -> gep(s+n+i,c) + return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr"); +} + Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); @@ -827,11 +885,8 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // These optimizations require DataLayout. - if (!DL) - return nullptr; - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy)) return nullptr; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) @@ -842,11 +897,8 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // These optimizations require DataLayout. - if (!DL) - return nullptr; - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove)) return nullptr; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) @@ -857,11 +909,8 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // These optimizations require DataLayout. 
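The new optimizeMemChr above turns a variable-character memchr over a constant string into a bounds check plus a bit test, valid when the result only feeds a null comparison. A standalone model of the rewrite for the "\r\n" case from the comment, checked exhaustively against libc behavior (the fixed 64-bit mask width is an assumption of this sketch; the pass sizes the field from the largest character and the target's legal integers):

#include <cassert>
#include <cstdint>
#include <cstring>

static bool matchesCRLF(unsigned char C) {
  // Bits '\r' (13) and '\n' (10) are set; anything at or beyond the
  // field width fails the bounds check and cannot match.
  const uint64_t Bitfield = (1ULL << '\r') | (1ULL << '\n');
  return C < 64 && ((Bitfield >> C) & 1);
}

int main() {
  for (int C = 0; C < 256; ++C) {
    bool Ref = std::memchr("\r\n", C, 2) != nullptr;
    assert(matchesCRLF(static_cast<unsigned char>(C)) == Ref);
  }
}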
- if (!DL) - return nullptr; - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset)) return nullptr; // memset(p, v, n) -> llvm.memset(p, v, n, 1) @@ -924,7 +973,7 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, // floor((double)floatval) -> (double)floorf(floatval) if (Callee->isIntrinsic()) { Module *M = CI->getParent()->getParent()->getParent(); - Intrinsic::ID IID = (Intrinsic::ID) Callee->getIntrinsicID(); + Intrinsic::ID IID = Callee->getIntrinsicID(); Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); V = B.CreateCall(F, V); } else { @@ -1101,7 +1150,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Value *Callee = M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(), Op->getType(), B.getInt32Ty(), nullptr); - CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); + CallInst *CI = B.CreateCall(Callee, {One, LdExpArg}); if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -1387,7 +1436,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Type *ArgType = Op->getType(); Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); - Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz"); + Value *V = B.CreateCall(F, {Op, B.getFalse()}, "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); V = B.CreateIntCast(V, B.getInt32Ty(), false); @@ -1521,7 +1570,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { // printf("x") -> putchar('x'), even for '%'. if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, DL, TLI); + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); @@ -1534,7 +1583,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - Value *NewCI = EmitPutS(GV, B, DL, TLI); + Value *NewCI = EmitPutS(GV, B, TLI); return (CI->use_empty() || !NewCI) ? NewCI : ConstantInt::get(CI->getType(), FormatStr.size() + 1); @@ -1544,7 +1593,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { // printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getArgOperand(1), B, DL, TLI); + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TLI); if (CI->use_empty() || !Res) return Res; @@ -1554,7 +1603,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) { - return EmitPutS(CI->getArgOperand(1), B, DL, TLI); + return EmitPutS(CI->getArgOperand(1), B, TLI); } return nullptr; } @@ -1600,16 +1649,11 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { if (FormatStr[i] == '%') return nullptr; // we found a format specifier, bail out. - // These optimizations require DataLayout. 
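Context for the exp2 hunk above: for an integral argument the call is lowered to ldexp(1.0, n), which scales by a power of two exactly. A quick sanity check of that identity over a small range:

#include <cassert>
#include <cmath>

int main() {
  // exp2(n) == ldexp(1.0, n) for integral n; both are exact powers of two
  // in IEEE double for the exponents exercised here.
  for (int N = -10; N <= 10; ++N)
    assert(std::exp2(static_cast<double>(N)) == std::ldexp(1.0, N));
}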
- if (!DL) - return nullptr; - // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - B.CreateMemCpy( - CI->getArgOperand(0), CI->getArgOperand(1), - ConstantInt::get(DL->getIntPtrType(CI->getContext()), - FormatStr.size() + 1), - 1); // Copy the null byte. + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1), + 1); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1627,17 +1671,13 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); - Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); + Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); B.CreateStore(B.getInt8(0), Ptr); return ConstantInt::get(CI->getType(), 1); } if (FormatStr[1] == 's') { - // These optimizations require DataLayout. - if (!DL) - return nullptr; - // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; @@ -1702,13 +1742,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { if (FormatStr[i] == '%') // Could handle %% -> % if we cared. return nullptr; // We found a format specifier. - // These optimizations require DataLayout. - if (!DL) - return nullptr; - return EmitFWrite( CI->getArgOperand(1), - ConstantInt::get(DL->getIntPtrType(CI->getContext()), FormatStr.size()), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()), CI->getArgOperand(0), B, DL, TLI); } @@ -1723,14 +1759,14 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; - return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI); + return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI); + return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); } return nullptr; } @@ -1790,7 +1826,7 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { // This optimisation is only valid, if the return value is unused. if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); - Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, DL, TLI); + Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TLI); return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr; } @@ -1802,10 +1838,6 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // These optimizations require DataLayout. - if (!DL) - return nullptr; - // Require two pointers. Also, we can't optimize if return value is used. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || @@ -1820,7 +1852,7 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { // Known to have no uses (see above). 
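The sprintf hunk above rewrites a format string containing no '%' into a memcpy of the format (including its nul) and returns the format's length. A small demonstration of the contract being relied on ("%s" stands in for passing the raw format, to keep the call warning-free):

#include <cassert>
#include <cstdio>
#include <cstring>

int main() {
  char A[32], B[32];
  const char *Fmt = "no specifiers";
  int Ret = std::sprintf(A, "%s", Fmt);      // behaves like sprintf(A, Fmt)
  std::memcpy(B, Fmt, std::strlen(Fmt) + 1); // the fold, done by hand
  assert(Ret == static_cast<int>(std::strlen(Fmt)));
  assert(std::strcmp(A, B) == 0);
}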
return EmitFWrite( CI->getArgOperand(0), - ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len - 1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), CI->getArgOperand(1), B, DL, TLI); } @@ -1839,7 +1871,7 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { if (Str.empty() && CI->use_empty()) { // puts("") -> putchar('\n') - Value *Res = EmitPutChar(B.getInt32('\n'), B, DL, TLI); + Value *Res = EmitPutChar(B.getInt32('\n'), B, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); @@ -1906,6 +1938,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrCSpn(CI, Builder); case LibFunc::strstr: return optimizeStrStr(CI, Builder); + case LibFunc::memchr: + return optimizeMemChr(CI, Builder); case LibFunc::memcmp: return optimizeMemCmp(CI, Builder); case LibFunc::memcpy: @@ -2088,15 +2122,19 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return nullptr; } -LibCallSimplifier::LibCallSimplifier(const DataLayout *DL, - const TargetLibraryInfo *TLI) : - FortifiedSimplifier(DL, TLI), - DL(DL), - TLI(TLI), - UnsafeFPShrink(false) { +LibCallSimplifier::LibCallSimplifier( + const DataLayout &DL, const TargetLibraryInfo *TLI, + function_ref<void(Instruction *, Value *)> Replacer) + : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false), + Replacer(Replacer) {} + +void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { + // Indirect through the replacer used in this instance. + Replacer(I, With); } -void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const { +/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I, + Value *With) { I->replaceAllUsesWith(With); I->eraseFromParent(); } @@ -2183,7 +2221,7 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { @@ -2197,7 +2235,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> & Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { @@ -2211,7 +2249,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk, DL)) + if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { @@ -2227,8 +2265,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, LibFunc::Func Func) { Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); + const DataLayout &DL = CI->getModule()->getDataLayout(); - if (!checkStringCopyLibFuncSignature(Callee, Func, DL)) + if (!checkStringCopyLibFuncSignature(Callee, Func)) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1), @@ 
-2237,7 +2276,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // __stpcpy_chk(x,x,...) -> x+strlen(x) if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) { Value *StrLen = EmitStrLen(Src, B, DL, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; + return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr; } // If a) we don't have any length information, or b) we know this will @@ -2245,29 +2284,25 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // st[rp]cpy_chk call which may fail at runtime if the size is too long. // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. - if (isFortifiedCallFoldable(CI, 2, 1, true)) { - Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6)); - return Ret; - } else if (!OnlyLowerUnknownSize) { - // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. - uint64_t Len = GetStringLength(Src); - if (Len == 0) - return nullptr; + if (isFortifiedCallFoldable(CI, 2, 1, true)) + return EmitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6)); - // This optimization requires DataLayout. - if (!DL) - return nullptr; + if (OnlyLowerUnknownSize) + return nullptr; - Type *SizeTTy = DL->getIntPtrType(CI->getContext()); - Value *LenV = ConstantInt::get(SizeTTy, Len); - Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI); - // If the function was an __stpcpy_chk, and we were able to fold it into - // a __memcpy_chk, we still need to return the correct end pointer. - if (Ret && Func == LibFunc::stpcpy_chk) - return B.CreateGEP(Dst, ConstantInt::get(SizeTTy, Len - 1)); - return Ret; - } - return nullptr; + // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. + uint64_t Len = GetStringLength(Src); + if (Len == 0) + return nullptr; + + Type *SizeTTy = DL.getIntPtrType(CI->getContext()); + Value *LenV = ConstantInt::get(SizeTTy, Len); + Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI); + // If the function was an __stpcpy_chk, and we were able to fold it into + // a __memcpy_chk, we still need to return the correct end pointer. + if (Ret && Func == LibFunc::stpcpy_chk) + return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1)); + return Ret; } Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, @@ -2276,20 +2311,29 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); - if (!checkStringCopyLibFuncSignature(Callee, Func, DL)) + if (!checkStringCopyLibFuncSignature(Callee, Func)) return nullptr; if (isFortifiedCallFoldable(CI, 3, 2, false)) { - Value *Ret = - EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7)); + Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI, Name.substr(2, 7)); return Ret; } return nullptr; } Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { - if (CI->isNoBuiltin()) - return nullptr; + // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here. + // Some clang users checked for _chk libcall availability using: + // __has_builtin(__builtin___memcpy_chk) + // When compiling with -fno-builtin, this is always true. 
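On the __stpcpy_chk path above: when the source length Len (counting the nul) is known, the call can be lowered to a __memcpy_chk, but stpcpy's contract is to return a pointer to the copied nul, hence the extra GEP to Dst + Len - 1. The arithmetic, in a minimal sketch:

#include <cassert>
#include <cstring>

int main() {
  char Dst[16];
  const char *Src = "abc";
  size_t Len = std::strlen(Src) + 1; // length including the nul
  std::memcpy(Dst, Src, Len);
  // stpcpy would have returned the end pointer: Dst + Len - 1 is the nul.
  assert(Dst + Len - 1 == Dst + std::strlen(Dst));
}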
+ // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we + // end up with fortified libcalls, which isn't acceptable in a freestanding + // environment which only provides their non-fortified counterparts. + // + // Until we change clang and/or teach external users to check for availability + // differently, disregard the "nobuiltin" attribute and TLI::has. + // + // PR23093. LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); @@ -2298,7 +2342,7 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // First, check that this is a known library functions. - if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func)) + if (!TLI->getLibFunc(FuncName, Func)) return nullptr; // We never change the calling convention. @@ -2324,8 +2368,6 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { return nullptr; } -FortifiedLibCallSimplifier:: -FortifiedLibCallSimplifier(const DataLayout *DL, const TargetLibraryInfo *TLI, - bool OnlyLowerUnknownSize) - : DL(DL), TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) { -} +FortifiedLibCallSimplifier::FortifiedLibCallSimplifier( + const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize) + : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {} diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index d36283e..a2a54da 100644 --- a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -60,7 +60,8 @@ #define DEBUG_TYPE "symbol-rewriter" #include "llvm/CodeGen/Passes.h" #include "llvm/Pass.h" -#include "llvm/PassManager.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MemoryBuffer.h" @@ -72,15 +73,15 @@ #include "llvm/Transforms/Utils/SymbolRewriter.h" using namespace llvm; +using namespace SymbolRewriter; static cl::list<std::string> RewriteMapFiles("rewrite-map-file", cl::desc("Symbol Rewrite Map"), cl::value_desc("filename")); -namespace llvm { -namespace SymbolRewriter { -void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source, - const std::string &Target) { +static void rewriteComdat(Module &M, GlobalObject *GO, + const std::string &Source, + const std::string &Target) { if (Comdat *CD = GO->getComdat()) { auto &Comdats = M.getComdatSymbolTable(); @@ -92,6 +93,7 @@ void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source, } } +namespace { template <RewriteDescriptor::Type DT, typename ValueType, ValueType *(llvm::Module::*Get)(StringRef) const> class ExplicitRewriteDescriptor : public RewriteDescriptor { @@ -226,6 +228,7 @@ typedef PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, &llvm::Module::getNamedAlias, &llvm::Module::aliases> PatternRewriteNamedAliasDescriptor; +} // namespace bool RewriteMapParser::parse(const std::string &MapFile, RewriteDescriptorList *DL) { @@ -489,8 +492,6 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, return true; } -} -} namespace { class RewriteSymbols : public ModulePass { diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp index 65f2ae2..cac80ac 100644 --- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -13,6 +13,7 @@ 
//===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" @@ -155,13 +156,12 @@ static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) { } static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<UniquableMDNode *> &Cycles, + SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer); -static Metadata *mapMetadataOp(Metadata *Op, - SmallVectorImpl<UniquableMDNode *> &Cycles, +static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { @@ -182,157 +182,85 @@ static Metadata *mapMetadataOp(Metadata *Op, return nullptr; } -static Metadata *cloneMDTuple(const MDTuple *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer, - bool IsDistinct) { - // Distinct MDTuples have their own code path. - assert(!IsDistinct && "Unexpected distinct tuple"); - (void)IsDistinct; - - SmallVector<Metadata *, 4> Elts; - Elts.reserve(Node->getNumOperands()); - for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) - Elts.push_back(mapMetadataOp(Node->getOperand(I), Cycles, VM, Flags, - TypeMapper, Materializer)); - - return MDTuple::get(Node->getContext(), Elts); -} - -static Metadata *cloneMDLocation(const MDLocation *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer, - bool IsDistinct) { - return (IsDistinct ? MDLocation::getDistinct : MDLocation::get)( - Node->getContext(), Node->getLine(), Node->getColumn(), - mapMetadataOp(Node->getScope(), Cycles, VM, Flags, TypeMapper, - Materializer), - mapMetadataOp(Node->getInlinedAt(), Cycles, VM, Flags, TypeMapper, - Materializer)); -} - -static Metadata *cloneMDNode(const UniquableMDNode *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer, bool IsDistinct) { - switch (Node->getMetadataID()) { - default: - llvm_unreachable("Invalid UniquableMDNode subclass"); -#define HANDLE_UNIQUABLE_LEAF(CLASS) \ - case Metadata::CLASS##Kind: \ - return clone##CLASS(cast<CLASS>(Node), Cycles, VM, Flags, TypeMapper, \ - Materializer, IsDistinct); -#include "llvm/IR/Metadata.def" +/// \brief Remap nodes. +/// +/// Insert \c NewNode in the value map, and then remap \c OldNode's operands. +/// Assumes that \c NewNode is already a clone of \c OldNode. +/// +/// \pre \c NewNode is a clone of \c OldNode. +static bool remap(const MDNode *OldNode, MDNode *NewNode, + SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(OldNode->getNumOperands() == NewNode->getNumOperands() && + "Expected nodes to match"); + assert(OldNode->isResolved() && "Expected resolved node"); + assert(!NewNode->isUniqued() && "Expected non-uniqued node"); + + // Map the node upfront so it's available for cyclic references. 
+ mapToMetadata(VM, OldNode, NewNode); + bool AnyChanged = false; + for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) { + Metadata *Old = OldNode->getOperand(I); + assert(NewNode->getOperand(I) == Old && + "Expected old operands to already be in place"); + + Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags, + TypeMapper, Materializer); + if (Old != New) { + AnyChanged = true; + NewNode->replaceOperandWith(I, New); + } } -} -static void -trackCyclesUnderDistinct(const UniquableMDNode *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles) { - // Track any cycles beneath this node. - for (Metadata *Op : Node->operands()) - if (auto *N = dyn_cast_or_null<UniquableMDNode>(Op)) - if (!N->isResolved()) - Cycles.push_back(N); + return AnyChanged; } /// \brief Map a distinct MDNode. /// /// Distinct nodes are not uniqued, so they must always recreated. -static Metadata *mapDistinctNode(const UniquableMDNode *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles, +static Metadata *mapDistinctNode(const MDNode *Node, + SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { assert(Node->isDistinct() && "Expected distinct node"); - // Optimization for MDTuples. - if (isa<MDTuple>(Node)) { - // Create the node first so it's available for cyclical references. - SmallVector<Metadata *, 4> EmptyOps(Node->getNumOperands()); - MDTuple *NewMD = MDTuple::getDistinct(Node->getContext(), EmptyOps); - mapToMetadata(VM, Node, NewMD); - - // Fix the operands. - for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) - NewMD->replaceOperandWith(I, - mapMetadataOp(Node->getOperand(I), Cycles, VM, - Flags, TypeMapper, Materializer)); - - trackCyclesUnderDistinct(NewMD, Cycles); - return NewMD; - } + MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone()); + remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer); - // In general we need a dummy node, since whether the operands are null can - // affect the size of the node. - std::unique_ptr<MDNodeFwdDecl> Dummy( - MDNode::getTemporary(Node->getContext(), None)); - mapToMetadata(VM, Node, Dummy.get()); - auto *NewMD = cast<UniquableMDNode>(cloneMDNode(Node, Cycles, VM, Flags, - TypeMapper, Materializer, - /* IsDistinct */ true)); - Dummy->replaceAllUsesWith(NewMD); - trackCyclesUnderDistinct(NewMD, Cycles); - return mapToMetadata(VM, Node, NewMD); -} + // Track any cycles beneath this node. + for (Metadata *Op : NewMD->operands()) + if (auto *Node = dyn_cast_or_null<MDNode>(Op)) + if (!Node->isResolved()) + Cycles.push_back(Node); -/// \brief Check whether a uniqued node needs to be remapped. -/// -/// Check whether a uniqued node needs to be remapped (due to any operands -/// changing). -static bool shouldRemapUniquedNode(const UniquableMDNode *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - // Check all operands to see if any need to be remapped. - for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) { - Metadata *Op = Node->getOperand(I); - if (Op != mapMetadataOp(Op, Cycles, VM, Flags, TypeMapper, Materializer)) - return true; - } - return false; + return NewMD; } /// \brief Map a uniqued MDNode. /// /// Uniqued nodes may not need to be recreated (they may map to themselves). 
-static Metadata *mapUniquedNode(const UniquableMDNode *Node, - SmallVectorImpl<UniquableMDNode *> &Cycles, +static Metadata *mapUniquedNode(const MDNode *Node, + SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { - assert(!Node->isDistinct() && "Expected uniqued node"); - - // Create a dummy node in case we have a metadata cycle. - MDNodeFwdDecl *Dummy = MDNode::getTemporary(Node->getContext(), None); - mapToMetadata(VM, Node, Dummy); - - // Check all operands to see if any need to be remapped. - if (!shouldRemapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, - Materializer)) { - // Use an identity mapping. - mapToSelf(VM, Node); - MDNode::deleteTemporary(Dummy); - return const_cast<Metadata *>(static_cast<const Metadata *>(Node)); - } + assert(Node->isUniqued() && "Expected uniqued node"); - // At least one operand needs remapping. - Metadata *NewMD = - cloneMDNode(Node, Cycles, VM, Flags, TypeMapper, Materializer, - /* IsDistinct */ false); - Dummy->replaceAllUsesWith(NewMD); - MDNode::deleteTemporary(Dummy); - return mapToMetadata(VM, Node, NewMD); + // Create a temporary node upfront in case we have a metadata cycle. + auto ClonedMD = Node->clone(); + if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer)) + // No operands changed, so use the identity mapping. + return mapToSelf(VM, Node); + + // At least one operand has changed, so uniquify the cloned node. + return mapToMetadata(VM, Node, + MDNode::replaceWithUniqued(std::move(ClonedMD))); } static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<UniquableMDNode *> &Cycles, + SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { @@ -364,14 +292,18 @@ static Metadata *MapMetadataImpl(const Metadata *MD, return nullptr; } - const UniquableMDNode *Node = cast<UniquableMDNode>(MD); - assert(Node->isResolved() && "Unexpected unresolved node"); + // Note: this cast precedes the Flags check so we always get its associated + // assertion. + const MDNode *Node = cast<MDNode>(MD); // If this is a module-level metadata and we know that nothing at the // module level is changing, then use an identity mapping. if (Flags & RF_NoModuleLevelChanges) return mapToSelf(VM, MD); + // Require resolved nodes whenever metadata might be remapped. + assert(Node->isResolved() && "Unexpected unresolved node"); + if (Node->isDistinct()) return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); @@ -381,17 +313,19 @@ static Metadata *MapMetadataImpl(const Metadata *MD, Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { - SmallVector<UniquableMDNode *, 8> Cycles; + SmallVector<MDNode *, 8> Cycles; Metadata *NewMD = MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer); // Resolve cycles underneath MD. if (NewMD && NewMD != MD) { - if (auto *N = dyn_cast<UniquableMDNode>(NewMD)) - N->resolveCycles(); + if (auto *N = dyn_cast<MDNode>(NewMD)) + if (!N->isResolved()) + N->resolveCycles(); - for (UniquableMDNode *N : Cycles) - N->resolveCycles(); + for (MDNode *N : Cycles) + if (!N->isResolved()) + N->resolveCycles(); } else { // Shouldn't get unresolved cycles if nothing was remapped. 
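The ValueMapper rewrite above hinges on one invariant: the clone is entered into the value map before its operands are remapped, so cyclic references (common in debug-info metadata) resolve to the clone instead of recursing forever. A generic sketch of that pattern on a toy graph (plain C++, leaks its nodes for brevity):

#include <cassert>
#include <map>
#include <vector>

struct Node {
  std::vector<Node *> Ops;
};

static Node *cloneGraph(Node *N, std::map<Node *, Node *> &VM) {
  if (!N)
    return nullptr;
  auto It = VM.find(N);
  if (It != VM.end())
    return It->second;        // already cloned (or being cloned right now)
  Node *Clone = new Node(*N); // operands still point at the originals
  VM[N] = Clone;              // map upfront, as remap() does above
  for (Node *&Op : Clone->Ops)
    Op = cloneGraph(Op, VM);
  return Clone;
}

int main() {
  Node A, B;
  A.Ops = {&B};
  B.Ops = {&A}; // a two-node cycle
  std::map<Node *, Node *> VM;
  Node *A2 = cloneGraph(&A, VM);
  assert(A2 != &A && A2->Ops[0]->Ops[0] == A2); // clone is self-consistent
}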
assert(Cycles.empty() && "Expected no unresolved cycles"); @@ -450,7 +384,24 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, I->setMetadata(MI->first, New); } + if (!TypeMapper) + return; + // If the instruction's type is being remapped, do so now. - if (TypeMapper) - I->mutateType(TypeMapper->remapType(I->getType())); + if (auto CS = CallSite(I)) { + SmallVector<Type *, 3> Tys; + FunctionType *FTy = CS.getFunctionType(); + Tys.reserve(FTy->getNumParams()); + for (Type *Ty : FTy->params()) + Tys.push_back(TypeMapper->remapType(Ty)); + CS.mutateFunctionType(FunctionType::get( + TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg())); + return; + } + if (auto *AI = dyn_cast<AllocaInst>(I)) + AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType())); + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + GEP->setSourceElementType( + TypeMapper->remapType(GEP->getSourceElementType())); + I->mutateType(TypeMapper->remapType(I->getType())); } diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index a0ccf9d..215d6f9 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -201,14 +202,14 @@ namespace { initializeBBVectorizePass(*PassRegistry::getPassRegistry()); } - BBVectorize(Pass *P, const VectorizeConfig &C) + BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { AA = &P->getAnalysis<AliasAnalysis>(); DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &P->getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>(); + TTI = IgnoreTargetInfo + ? nullptr + : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); } typedef std::pair<Value *, Value *> ValuePair; @@ -220,7 +221,6 @@ namespace { AliasAnalysis *AA; DominatorTree *DT; ScalarEvolution *SE; - const DataLayout *DL; const TargetTransformInfo *TTI; // FIXME: const correct? @@ -440,9 +440,10 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>(); + TTI = IgnoreTargetInfo + ? 
nullptr + : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *BB.getParent()); return vectorizeBB(BB); } @@ -452,7 +453,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); @@ -637,19 +638,19 @@ namespace { dyn_cast<SCEVConstant>(OffsetSCEV)) { ConstantInt *IntOff = ConstOffSCEV->getValue(); int64_t Offset = IntOff->getSExtValue(); - + const DataLayout &DL = I->getModule()->getDataLayout(); Type *VTy = IPtr->getType()->getPointerElementType(); - int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy); + int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); Type *VTy2 = JPtr->getType()->getPointerElementType(); if (VTy != VTy2 && Offset < 0) { - int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2); + int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); OffsetInElmts = Offset/VTy2TSS; - return (abs64(Offset) % VTy2TSS) == 0; + return (std::abs(Offset) % VTy2TSS) == 0; } OffsetInElmts = Offset/VTyTSS; - return (abs64(Offset) % VTyTSS) == 0; + return (std::abs(Offset) % VTyTSS) == 0; } return false; @@ -661,7 +662,7 @@ namespace { Function *F = I->getCalledFunction(); if (!F) return false; - Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); + Intrinsic::ID IID = F->getIntrinsicID(); if (!IID) return false; switch(IID) { @@ -841,7 +842,7 @@ namespace { // It is important to cleanup here so that future iterations of this // function have less work to do. - (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo()); + (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo()); return true; } @@ -895,10 +896,6 @@ namespace { return false; } - // We can't vectorize memory operations without target data - if (!DL && IsSimpleLoadStore) - return false; - Type *T1, *T2; getInstructionTypes(I, T1, T2); @@ -933,9 +930,8 @@ namespace { if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) return false; - if ((!Config.VectorizePointers || !DL) && - (T1->getScalarType()->isPointerTy() || - T2->getScalarType()->isPointerTy())) + if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() || + T2->getScalarType()->isPointerTy())) return false; if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || @@ -980,8 +976,8 @@ namespace { unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, - OffsetInElmts) && abs64(OffsetInElmts) == 1) { + IAddressSpace, JAddressSpace, OffsetInElmts) && + std::abs(OffsetInElmts) == 1) { FixedOrder = (int) OffsetInElmts; unsigned BottomAlignment = IAlignment; if (OffsetInElmts < 0) BottomAlignment = JAlignment; @@ -996,8 +992,8 @@ namespace { // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. 
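For the BBVectorize pairing logic above: two memory accesses are fusion candidates only when their constant byte offset is an exact multiple of the element store size and equals one element in magnitude. The arithmetic, with illustrative numbers:

#include <cassert>
#include <cstdint>
#include <cstdlib>

int main() {
  int64_t TypeStoreSize = 4; // e.g. an i32 access
  int64_t ByteOffset = -4;   // J sits one element below I
  assert(std::abs(ByteOffset) % TypeStoreSize == 0); // no straddling
  int64_t OffsetInElmts = ByteOffset / TypeStoreSize;
  assert(std::abs(OffsetInElmts) == 1); // adjacent, so a candidate pair
}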
- - unsigned VecAlignment = DL->getPrefTypeAlignment(VType); + const DataLayout &DL = I->getModule()->getDataLayout(); + unsigned VecAlignment = DL.getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; } @@ -1102,7 +1098,7 @@ namespace { CallInst *CI = dyn_cast<CallInst>(I); Function *FI; if (CI && (FI = CI->getCalledFunction())) { - Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); + Intrinsic::ID IID = FI->getIntrinsicID(); if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || IID == Intrinsic::cttz) { Value *A1I = CI->getArgOperand(1), @@ -1277,7 +1273,7 @@ namespace { CostSavings, FixedOrder)) continue; // J is a candidate for merging with I. - if (!PairableInsts.size() || + if (PairableInsts.empty() || PairableInsts[PairableInsts.size()-1] != I) { PairableInsts.push_back(I); } @@ -2774,7 +2770,7 @@ namespace { continue; } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); - Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); + Intrinsic::ID IID = F->getIntrinsicID(); if (o == NumOperands-1) { BasicBlock &BB = *I->getParent(); @@ -3107,7 +3103,17 @@ namespace { else if (H->hasName()) K->takeName(H); - if (!isa<StoreInst>(K)) + if (auto CS = CallSite(K)) { + SmallVector<Type *, 3> Tys; + FunctionType *Old = CS.getFunctionType(); + unsigned NumOld = Old->getNumParams(); + assert(NumOld <= ReplacedOperands.size()); + for (unsigned i = 0; i != NumOld; ++i) + Tys.push_back(ReplacedOperands[i]->getType()); + CS.mutateFunctionType( + FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), + Tys, Old->isVarArg())); + } else if (!isa<StoreInst>(K)) K->mutateType(getVecTypeForPair(L->getType(), H->getType())); unsigned KnownIDs[] = { @@ -3192,7 +3198,7 @@ char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) @@ -3203,7 +3209,7 @@ BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { bool llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { - BBVectorize BBVectorizer(P, C); + BBVectorize BBVectorizer(P, *BB.getParent(), C); return BBVectorizer.vectorizeBB(BB); } diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 47b92a3..011fd0f 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,6 +58,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -92,6 +93,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/VectorUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <map> #include <tuple> @@ -105,15 +107,6 @@ using namespace llvm::PatternMatch; STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops 
analyzed for vectorization"); -static cl::opt<unsigned> -VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, - cl::desc("Sets the SIMD width. Zero is autoselect.")); - -static cl::opt<unsigned> -VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, - cl::desc("Sets the vectorization interleave count. " - "Zero is autoselect.")); - static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); @@ -144,13 +137,6 @@ static cl::opt<bool> EnableMemAccessVersioning( /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// When performing memory disambiguation checks at runtime do not make more -/// than this number of comparisons. -static const unsigned RuntimeMemoryCheckThreshold = 8; - -/// Maximum simd width. -static const unsigned MaxVectorWidth = 64; - static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers.")); @@ -218,29 +204,30 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizeHints; -/// Optimization analysis message produced during vectorization. Messages inform -/// the user why vectorization did not occur. -class Report { - std::string Message; - raw_string_ostream Out; - Instruction *Instr; - +/// \brief This modifies LoopAccessReport to initialize message with +/// loop-vectorizer-specific part. +class VectorizationReport : public LoopAccessReport { public: - Report(Instruction *I = nullptr) : Out(Message), Instr(I) { - Out << "loop not vectorized: "; - } - - template <typename A> Report &operator<<(const A &Value) { - Out << Value; - return *this; - } - - Instruction *getInstr() { return Instr; } - - std::string &str() { return Out.str(); } - operator Twine() { return Out.str(); } + VectorizationReport(Instruction *I = nullptr) + : LoopAccessReport("loop not vectorized: ", I) {} + + /// \brief This allows promotion of the loop-access analysis report into the + /// loop-vectorizer report. It modifies the message to add the + /// loop-vectorizer-specific part of the message. + explicit VectorizationReport(const LoopAccessReport &R) + : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), + R.getInstr()) {} }; +/// A helper function for converting Scalar types to vector types. +/// If the incoming type is void, we return void. If the VF is 1, we return +/// the scalar type. +static Type* ToVectorTy(Type *Scalar, unsigned VF) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, VF); +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). 
/// This class performs the widening of scalars into vectors, or multiple @@ -258,13 +245,13 @@ public: class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const DataLayout *DL, - const TargetLibraryInfo *TLI, unsigned VecWidth, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, unsigned VecWidth, unsigned UnrollFactor) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr) {} + Legal(nullptr), AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *L) { @@ -278,6 +265,11 @@ public: updateAnalysis(); } + // Return true if any runtime check is added. + bool IsSafetyChecksAdded() { + return AddedSafetyChecks; + } + virtual ~InnerLoopVectorizer() {} protected: @@ -288,19 +280,12 @@ protected: /// originated from one scalar instruction. typedef SmallVector<Value*, 2> VectorParts; - // When we if-convert we need create edge masks. We have to cache values so - // that we don't end up with exponential recursion/IR. + // When we if-convert we need to create edge masks. We have to cache values + // so that we don't end up with exponential recursion/IR. typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts> EdgeMaskCache; - /// \brief Add code that checks at runtime if the accessed arrays overlap. - /// - /// Returns a pair of instructions where the first element is the first - /// instruction generated in possibly a sequence of instructions and the - /// second value is the final comparator value or NULL if no check is needed. - std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc); - - /// \brief Add checks for strides that where assumed to be 1. + /// \brief Add checks for strides that were assumed to be 1. /// /// Returns the last check instruction and the first check instruction in the /// pair as (first, last). @@ -355,10 +340,9 @@ protected: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// This function adds 0, 1, 2 ... to each vector element, starting at zero. - /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). - /// The sequence starts at StartIndex. - virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) + /// to each vector element of Val. The sequence starts at StartIndex. + virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -420,10 +404,10 @@ protected: DominatorTree *DT; /// Alias Analysis. AliasAnalysis *AA; - /// Data Layout. - const DataLayout *DL; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Target Transform Info. + const TargetTransformInfo *TTI; /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. @@ -465,21 +449,24 @@ protected: EdgeMaskCache MaskCache; LoopVectorizationLegality *Legal; + + // Record whether runtime check is added. 
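The rename from getConsecutiveVector to getStepVector above reflects a generalization: instead of only 0, 1, 2, ... (optionally negated), the induction sequence now advances by an arbitrary Step. A standalone scalar model (illustrative, not the IR-emitting code):

#include <cassert>
#include <vector>

static std::vector<int> getStepVector(std::vector<int> Val, int StartIdx,
                                      int Step) {
  // Add (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) lane by lane.
  for (size_t I = 0; I < Val.size(); ++I)
    Val[I] += StartIdx + static_cast<int>(I) * Step;
  return Val;
}

int main() {
  std::vector<int> Splat(4, 100); // broadcast of the scalar 100
  std::vector<int> V = getStepVector(Splat, 0, -1);
  assert(V[0] == 100 && V[1] == 99 && V[3] == 97); // a reverse induction
}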
+ bool AddedSafetyChecks; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const DataLayout *DL, - const TargetLibraryInfo *TLI, unsigned UnrollFactor) : - InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {} private: void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; Value *reverseVector(Value *Vec) override; }; @@ -517,9 +504,8 @@ static std::string getDebugLocString(const Loop *L) { std::string Result; if (L) { raw_string_ostream OS(Result); - const DebugLoc LoopDbgLoc = L->getStartLoc(); - if (!LoopDbgLoc.isUnknown()) - LoopDbgLoc.print(L->getHeader()->getContext(), OS); + if (const DebugLoc LoopDbgLoc = L->getStartLoc()) + LoopDbgLoc.print(OS); else // Just print the module name. OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); @@ -574,135 +560,84 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - unsigned NumLoads; - unsigned NumStores; - unsigned NumPredStores; - - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, - DominatorTree *DT, TargetLibraryInfo *TLI, - AliasAnalysis *AA, Function *F, - const TargetTransformInfo *TTI) - : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { - } - - /// This enum represents the kinds of reductions that we support. - enum ReductionKind { - RK_NoReduction, ///< Not a reduction. - RK_IntegerAdd, ///< Sum of integers. - RK_IntegerMult, ///< Product of integers. - RK_IntegerOr, ///< Bitwise or logical OR of numbers. - RK_IntegerAnd, ///< Bitwise or logical AND of numbers. - RK_IntegerXor, ///< Bitwise or logical XOR of numbers. - RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). - RK_FloatAdd, ///< Sum of floats. - RK_FloatMult, ///< Product of floats. - RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). - }; + LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, + TargetLibraryInfo *TLI, AliasAnalysis *AA, + Function *F, const TargetTransformInfo *TTI, + LoopAccessAnalysis *LAA) + : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), + WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of inductions that we support. enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = 1. - IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). - IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). - }; - - // This enum represents the kind of minmax reduction. 
- enum MinMaxReductionKind { - MRK_Invalid, - MRK_UIntMin, - MRK_UIntMax, - MRK_SIntMin, - MRK_SIntMax, - MRK_FloatMin, - MRK_FloatMax - }; - - /// This struct holds information about reduction variables. - struct ReductionDescriptor { - ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), - Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} - - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, - MinMaxReductionKind MK) - : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} - - // The starting value of the reduction. - // It does not have to be zero! - TrackingVH<Value> StartValue; - // The instruction who's value is used outside the loop. - Instruction *LoopExitInstr; - // The kind of the reduction. - ReductionKind Kind; - // If this a min/max reduction the kind of reduction. - MinMaxReductionKind MinMaxKind; + IK_NoInduction, ///< Not an induction variable. + IK_IntInduction, ///< Integer induction variable. Step = C. + IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). }; - /// This POD struct holds information about a potential reduction operation. - struct ReductionInstDesc { - ReductionInstDesc(bool IsRedux, Instruction *I) : - IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} - - ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : - IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} - - // Is this instruction a reduction candidate. - bool IsReduction; - // The last instruction in a min/max pattern (select of the select(icmp()) - // pattern), or the current reduction instruction otherwise. - Instruction *PatternLastInst; - // If this is a min/max pattern the comparison predicate. - MinMaxReductionKind MinMaxKind; - }; - - /// This struct holds information about the memory runtime legality - /// check that a group of pointers do not overlap. - struct RuntimePointerCheck { - RuntimePointerCheck() : Need(false) {} - - /// Reset the state of the pointer runtime information. - void reset() { - Need = false; - Pointers.clear(); - Starts.clear(); - Ends.clear(); - IsWritePtr.clear(); - DependencySetId.clear(); - AliasSetId.clear(); + /// A struct for saving information about induction variables. + struct InductionInfo { + InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); + } + InductionInfo() + : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} + + /// Get the consecutive direction. Returns: + /// 0 - unknown or non-consecutive. + /// 1 - consecutive and increasing. + /// -1 - consecutive and decreasing. + int getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; } - /// Insert a pointer and calculate the start and end SCEVs. 
- void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, - unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); - - /// This flag indicates if we need to add the runtime check. - bool Need; - /// Holds the pointers that we need to check. - SmallVector<TrackingVH<Value>, 2> Pointers; - /// Holds the pointer value at the beginning of the loop. - SmallVector<const SCEV*, 2> Starts; - /// Holds the pointer value at the end of the loop. - SmallVector<const SCEV*, 2> Ends; - /// Holds the information if this pointer is used for writing to memory. - SmallVector<bool, 2> IsWritePtr; - /// Holds the id of the set of pointers that could be dependent because of a - /// shared underlying object. - SmallVector<unsigned, 2> DependencySetId; - /// Holds the id of the disjoint alias set to which this pointer belongs. - SmallVector<unsigned, 2> AliasSetId; - }; + /// Compute the transformed value of Index at offset StartValue using step + /// StepValue. + /// For integer induction, returns StartValue + Index * StepValue. + /// For pointer induction, returns StartValue[Index * StepValue]. + /// FIXME: The newly created binary instructions should contain nsw/nuw + /// flags, which can be found from the original scalar operations. + Value *transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(nullptr, StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); + } - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; + /// Step value. + ConstantInt *StepValue; }; /// ReductionList contains the reduction descriptors for all @@ -754,13 +689,15 @@ public: bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. - RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const { + return LAI->getRuntimePointerCheck(); + } - /// This function returns the identity element (or neutral element) for - /// the operation K. 
- static Constant *getReductionIdentity(ReductionKind K, Type *Tp); + const LoopAccessInfo *getLAI() const { + return LAI; + } - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } bool hasStride(Value *V) { return StrideSet.count(V); } bool mustCheckStrides() { return !StrideSet.empty(); } @@ -784,6 +721,15 @@ public: bool isMaskRequired(const Instruction* I) { return (MaskedOp.count(I) != 0); } + unsigned getNumStores() const { + return LAI->getNumStores(); + } + unsigned getNumLoads() const { + return LAI->getNumLoads(); + } + unsigned getNumPredStores() const { + return NumPredStores; + } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -808,23 +754,9 @@ private: /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); - /// Returns True, if 'Phi' is the kind of reduction variable for type - /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. - bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns a struct describing if the instruction 'I' can be a reduction - /// variable of type 'Kind'. If the reduction is a min/max pattern of - /// select(icmp()) this function advances the instruction pointer 'I' from the - /// compare instruction to the select instruction and stores this pointer in - /// 'PatternLastInst' member of the returned struct. - ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, - ReductionInstDesc &Desc); - /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction - /// pattern corresponding to a min(X, Y) or max(X, Y). - static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, - ReductionInstDesc &Prev); - /// Returns the induction kind of Phi. This function may return NoInduction - /// if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi); + /// Returns the induction kind of Phi and record the step. This function may + /// return NoInduction if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); /// \brief Collect memory access with loop invariant strides. /// @@ -833,31 +765,32 @@ private: void collectStridedAccess(Value *LoadOrStoreInst); /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. - void emitAnalysis(Report &Message) { - DebugLoc DL = TheLoop->getStartLoc(); - if (Instruction *I = Message.getInstr()) - DL = I->getDebugLoc(); - emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, - *TheFunction, DL, Message.str()); + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. + void emitAnalysis(const LoopAccessReport &Message) { + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); } + unsigned NumPredStores; + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; - /// DataLayout analysis. - const DataLayout *DL; - /// Dominators. - DominatorTree *DT; /// Target Library Info. TargetLibraryInfo *TLI; - /// Alias analysis. - AliasAnalysis *AA; /// Parent function Function *TheFunction; /// Target Transform Info const TargetTransformInfo *TTI; + /// Dominator Tree. 
+ DominatorTree *DT; + // LoopAccess analysis. + LoopAccessAnalysis *LAA; + // And the loop-accesses info corresponding to this loop. This pointer is + // null until canVectorizeMemory sets it up. + const LoopAccessInfo *LAI; // --- vectorization state --- // @@ -879,17 +812,13 @@ private: /// This set holds the variables which are known to be uniform after /// vectorization. SmallPtrSet<Instruction*, 4> Uniforms; - /// We need to check that all of the pointers in this list are disjoint - /// at runtime. - RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; - unsigned MaxSafeDepDistBytes; - ValueToValueMap Strides; SmallPtrSet<Value *, 8> StrideSet; - + /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet<const Instruction*, 8> MaskedOp; @@ -907,10 +836,9 @@ public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const DataLayout *DL, const TargetLibraryInfo *TLI, - AssumptionCache *AC, const Function *F, - const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), + const TargetLibraryInfo *TLI, AssumptionCache *AC, + const Function *F, const LoopVectorizeHints *Hints) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), TheFunction(F), Hints(Hints) { CodeMetrics::collectEphemeralValues(L, AC, EphValues); } @@ -963,23 +891,16 @@ private: /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); - /// A helper function for converting Scalar types to vector types. - /// If the incoming type is void, we return void. If the VF is 1, we return - /// the scalar type. - static Type* ToVectorTy(Type *Scalar, unsigned VF); - /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. - void emitAnalysis(Report &Message) { - DebugLoc DL = TheLoop->getStartLoc(); - if (Instruction *I = Message.getInstr()) - DL = I->getDebugLoc(); - emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, - *TheFunction, DL, Message.str()); + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. + void emitAnalysis(const LoopAccessReport &Message) { + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); } /// Values used only by @llvm.assume calls. @@ -995,8 +916,6 @@ private: LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; - /// Target data layout information. - const DataLayout *DL; /// Target Library Info. 
const TargetLibraryInfo *TLI; const Function *TheFunction; @@ -1032,7 +951,7 @@ class LoopVectorizeHints { bool validate(unsigned Val) { switch (Kind) { case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= MaxVectorWidth; + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: @@ -1060,7 +979,8 @@ public: }; LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizationFactor, HK_WIDTH), + : Width("vectorize.width", VectorizerParams::VectorizationFactor, + HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), TheLoop(L) { @@ -1068,8 +988,8 @@ public: getHintsFromMetadata(); // force-vector-interleave overrides DisableInterleaving. - if (VectorizationInterleave.getNumOccurrences() > 0) - Interleave.Value = VectorizationInterleave; + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); @@ -1084,7 +1004,7 @@ public: /// Dumps all the hint information. std::string emitRemark() const { - Report R; + VectorizationReport R; if (Force.Value == LoopVectorizeHints::FK_Disabled) R << "vectorization is explicitly disabled"; else { @@ -1260,7 +1180,6 @@ struct LoopVectorize : public FunctionPass { } ScalarEvolution *SE; - const DataLayout *DL; LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; @@ -1268,6 +1187,7 @@ struct LoopVectorize : public FunctionPass { TargetLibraryInfo *TLI; AliasAnalysis *AA; AssumptionCache *AC; + LoopAccessAnalysis *LAA; bool DisableUnrolling; bool AlwaysVectorize; @@ -1275,15 +1195,15 @@ struct LoopVectorize : public FunctionPass { bool runOnFunction(Function &F) override { SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - LI = &getAnalysis<LoopInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BFI = &getAnalysis<BlockFrequencyInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + LAA = &getAnalysis<LoopAccessAnalysis>(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1295,12 +1215,6 @@ struct LoopVectorize : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - if (!DL) { - DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName() - << ": Missing data layout\n"); - return false; - } - // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops // and can invalidate iterators across the loops. @@ -1320,6 +1234,40 @@ struct LoopVectorize : public FunctionPass { return Changed; } + static void AddRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. 
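+    // The first operand of a loop ID node conventionally refers back to the
+    // node itself; the metadata produced below looks roughly like
+    // (metadata numbers are illustrative):
+    //   br i1 %cond, label %header, label %exit, !llvm.loop !0
+    //   !0 = distinct !{!0, !1}
+    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}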
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+ }
+
bool processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
@@ -1394,7 +1342,7 @@ struct LoopVectorize : public FunctionPass {
}
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
+ LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
@@ -1402,8 +1350,7 @@
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
- &Hints);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
// Check the function attributes to find out if this function should be
// optimized for size.
@@ -1467,14 +1414,20 @@
// We decided not to vectorize, but we may want to unroll.
- InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+ InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
Unroller.vectorize(&LVL);
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
LB.vectorize(&LVL);
++LoopsVectorized;
+ // Add metadata to disable runtime unrolling of the scalar loop when no
+ // runtime checks for strides or memory were added: in that case the
+ // scalar loop is rarely executed and is not worth unrolling.
+ if (!LB.IsSafetyChecksAdded())
+ AddRuntimeUnrollDisableMetaData(L);
+
// Report the vectorization decision.
emitOptimizationRemark(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1495,11 +1448,12 @@
AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfo>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolution>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AliasAnalysis>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopAccessAnalysis>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AliasAnalysis>();
}
@@ -1513,65 +1467,6 @@
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===// -static Value *stripIntegerCast(Value *V) { - if (CastInst *CI = dyn_cast<CastInst>(V)) - if (CI->getOperand(0)->getType()->isIntegerTy()) - return CI->getOperand(0); - return V; -} - -///\brief Replaces the symbolic stride in a pointer SCEV expression by one. -/// -/// If \p OrigPtr is not null, use it to look up the stride value instead of -/// \p Ptr. -static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, - ValueToValueMap &PtrToStride, - Value *Ptr, Value *OrigPtr = nullptr) { - - const SCEV *OrigSCEV = SE->getSCEV(Ptr); - - // If there is an entry in the map return the SCEV of the pointer with the - // symbolic stride replaced by one. - ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); - if (SI != PtrToStride.end()) { - Value *StrideVal = SI->second; - - // Strip casts. - StrideVal = stripIntegerCast(StrideVal); - - // Replace symbolic stride by one. - Value *One = ConstantInt::get(StrideVal->getType(), 1); - ValueToValueMap RewriteMap; - RewriteMap[StrideVal] = One; - - const SCEV *ByOne = - SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); - DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne - << "\n"); - return ByOne; - } - - // Otherwise, just return the SCEV of the original pointer. - return SE->getSCEV(Ptr); -} - -void LoopVectorizationLegality::RuntimePointerCheck::insert( - ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, - unsigned ASId, ValueToValueMap &Strides) { - // Get the stride replaced scev. - const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getBackedgeTakenCount(Lp); - const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.push_back(Ptr); - Starts.push_back(AR->getStart()); - Ends.push_back(ScEnd); - IsWritePtr.push_back(WritePtr); - DependencySetId.push_back(DepSetId); - AliasSetId.push_back(ASId); -} - Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast<Instruction>(V); @@ -1591,11 +1486,13 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, + Value *Step) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); + assert(Step->getType() == Val->getType()->getScalarType() && + "Step has wrong type"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast<VectorType>(Val->getType()); @@ -1603,24 +1500,27 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, SmallVector<Constant*, 8> Indices; // Create a vector of consecutive numbers from zero to VF. - for (int i = 0; i < VLen; ++i) { - int64_t Idx = Negate ? (-i) : i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); - } + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); // Add the consecutive indices to the vector value. 
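  // For instance, with VF = 4, StartIdx = 4 and Step = %s, Cv below is
  // <4, 5, 6, 7>, and the returned value is Val + <4*s, 5*s, 6*s, 7*s>.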
Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder.CreateAdd(Val, Cv, "induction"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); } /// \brief Find the operand of the GEP that should be checked for consecutive /// stores. This ignores trailing indices that have no effect on the final /// pointer. -static unsigned getGEPInductionOperand(const DataLayout *DL, - const GetElementPtrInst *Gep) { +static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { + const DataLayout &DL = Gep->getModule()->getDataLayout(); unsigned LastOperand = Gep->getNumOperands() - 1; - unsigned GEPAllocSize = DL->getTypeAllocSize( + unsigned GEPAllocSize = DL.getTypeAllocSize( cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); // Walk backwards and try to peel off zeros. @@ -1631,7 +1531,7 @@ static unsigned getGEPInductionOperand(const DataLayout *DL, // If it's a type with the same allocation size as the result of the GEP we // can peel off the zero index. - if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) + if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize) break; --LastOperand; } @@ -1649,10 +1549,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); @@ -1677,13 +1574,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } - unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + unsigned InductionOperand = getGEPInductionOperand(Gep); // Check that all of the gep indices are uniform except for our induction // operand. @@ -1730,7 +1624,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { } bool LoopVectorizationLegality::isUniform(Value *V) { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return LAI->isUniform(V); } InnerLoopVectorizer::VectorParts& @@ -1776,11 +1670,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. 
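  // E.g. an i32 load with no explicit alignment would get
  // DL.getABITypeAlignment(i32), typically 4 bytes on common targets.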
+ const DataLayout &DL = Instr->getModule()->getDataLayout(); if (!Alignment) - Alignment = DL->getABITypeAlignment(ScalarDataTy); + Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); - unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy); + unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF; if (SI && Legal->blockNeedsPredication(SI->getParent()) && !Legal->isMaskRequired(SI)) @@ -1821,7 +1716,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + unsigned InductionOperand = getGEPInductionOperand(Gep); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); @@ -1864,7 +1759,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + Value *PartPtr = + Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { // If we store to reverse consecutive memory locations then we need @@ -1872,8 +1768,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); Mask[Part] = reverseVector(Mask[Part]); } @@ -1896,13 +1792,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + Value *PartPtr = + Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { // If the address is consecutive but reversed, then the // wide load needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); Mask[Part] = reverseVector(Mask[Part]); } @@ -1992,7 +1889,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. 
Builder.SetInsertPoint(InsertPt); } @@ -2021,7 +1918,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -2078,102 +1975,6 @@ InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { return std::make_pair(FirstInst, TheCheck); } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { - LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); - - Instruction *tnullptr = nullptr; - if (!PtrRtCheck->Need) - return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - - unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector<TrackingVH<Value> , 2> Starts; - SmallVector<TrackingVH<Value> , 2> Ends; - - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = nullptr; - - for (unsigned i = 0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck->Pointers[i]; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << - *Ptr <<"\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); - Starts.push_back(Start); - Ends.push_back(End); - } - } - - IRBuilder<> ChkBuilder(Loc); - // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - // No need to check if two readonly pointers intersect. - if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) - continue; - - // Only need to check pointers between two different dependency sets. - if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. 
- if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; - } - } - - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(Check, "memcheck.conflict"); - FirstInst = getFirstInst(FirstInst, Check, Loc); - return std::make_pair(FirstInst, Check); -} - void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain @@ -2238,9 +2039,11 @@ void InnerLoopVectorizer::createEmptyLoop() { ExitCount = SE->getAddExpr(BackedgeTakeCount, SE->getConstant(BackedgeTakeCount->getType(), 1)); + const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); + // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, "induction"); + SCEVExpander Exp(*SE, DL, "induction"); // We need to test whether the backedge-taken count is uint##_max. Adding one // to it will cause overflow and an incorrect loop trip count in the vector @@ -2299,13 +2102,13 @@ void InnerLoopVectorizer::createEmptyLoop() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); + ParentLoop->addBasicBlockToLoop(VectorPH, *LI); + ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + Lp->addBasicBlockToLoop(VecBody, *LI); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. 
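// A rough sketch of the skeleton assembled above (names from this function):
//   LoopBypassBlocks -> VectorPH -> VecBody -> MiddleBlock -> ScalarPH
// where each failed bypass check branches straight to ScalarPH, leaving the
// original scalar loop as the fall-back path.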
@@ -2360,7 +2163,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); @@ -2376,11 +2179,12 @@ void InnerLoopVectorizer::createEmptyLoop() { std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(LastBypassBlock->getTerminator()); if (StrideCheck) { + AddedSafetyChecks = true; // Create a new block containing the stride check. BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2398,13 +2202,14 @@ void InnerLoopVectorizer::createEmptyLoop() { // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeCheck(LastBypassBlock->getTerminator()); + Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator()); if (MemRuntimeCheck) { + AddedSafetyChecks = true; // Create a new block containing the memory check. BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); + LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2495,33 +2300,13 @@ void InnerLoopVectorizer::createEmptyLoop() { Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, II.StartValue->getType(), "cast.crd"); - EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReverseIntInduction: { - // Convert the CountRoundDown variable to the PHI size. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - // Handle reverse integer induction counter. - EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); + EndValue = II.transform(BypassBuilder, CRD); + EndValue->setName("ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { - // For pointer induction variables, calculate the offset using - // the end index. - EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, - "ptr.ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReversePtrInduction: { - // The value at the end of the loop for the reverse pointer is calculated - // by creating a GEP with a negative index starting from the start value. - Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, - "rev.ind.end"); - EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, - "rev.ptr.ind.end"); + EndValue = II.transform(BypassBuilder, CountRoundDown); + EndValue->setName("ptr.ind.end"); break; } }// end of case @@ -2604,99 +2389,6 @@ void InnerLoopVectorizer::createEmptyLoop() { Hints.setAlreadyVectorized(); } -/// This function returns the identity element (or neutral element) for -/// the operation K. 
-Constant* -LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { - switch (K) { - case RK_IntegerXor: - case RK_IntegerAdd: - case RK_IntegerOr: - // Adding, Xoring, Oring zero to a number does not change it. - return ConstantInt::get(Tp, 0); - case RK_IntegerMult: - // Multiplying a number by 1 does not change it. - return ConstantInt::get(Tp, 1); - case RK_IntegerAnd: - // AND-ing a number with an all-1 value does not change it. - return ConstantInt::get(Tp, -1, true); - case RK_FloatMult: - // Multiplying a number by 1 does not change it. - return ConstantFP::get(Tp, 1.0L); - case RK_FloatAdd: - // Adding zero to a number does not change it. - return ConstantFP::get(Tp, 0.0L); - default: - llvm_unreachable("Unknown reduction kind"); - } -} - -/// This function translates the reduction kind to an LLVM binary operator. -static unsigned -getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { - switch (Kind) { - case LoopVectorizationLegality::RK_IntegerAdd: - return Instruction::Add; - case LoopVectorizationLegality::RK_IntegerMult: - return Instruction::Mul; - case LoopVectorizationLegality::RK_IntegerOr: - return Instruction::Or; - case LoopVectorizationLegality::RK_IntegerAnd: - return Instruction::And; - case LoopVectorizationLegality::RK_IntegerXor: - return Instruction::Xor; - case LoopVectorizationLegality::RK_FloatMult: - return Instruction::FMul; - case LoopVectorizationLegality::RK_FloatAdd: - return Instruction::FAdd; - case LoopVectorizationLegality::RK_IntegerMinMax: - return Instruction::ICmp; - case LoopVectorizationLegality::RK_FloatMinMax: - return Instruction::FCmp; - default: - llvm_unreachable("Unknown reduction operation"); - } -} - -Value *createMinMaxOp(IRBuilder<> &Builder, - LoopVectorizationLegality::MinMaxReductionKind RK, - Value *Left, - Value *Right) { - CmpInst::Predicate P = CmpInst::ICMP_NE; - switch (RK) { - default: - llvm_unreachable("Unknown min/max reduction kind"); - case LoopVectorizationLegality::MRK_UIntMin: - P = CmpInst::ICMP_ULT; - break; - case LoopVectorizationLegality::MRK_UIntMax: - P = CmpInst::ICMP_UGT; - break; - case LoopVectorizationLegality::MRK_SIntMin: - P = CmpInst::ICMP_SLT; - break; - case LoopVectorizationLegality::MRK_SIntMax: - P = CmpInst::ICMP_SGT; - break; - case LoopVectorizationLegality::MRK_FloatMin: - P = CmpInst::FCMP_OLT; - break; - case LoopVectorizationLegality::MRK_FloatMax: - P = CmpInst::FCMP_OGT; - break; - } - - Value *Cmp; - if (RK == LoopVectorizationLegality::MRK_FloatMin || - RK == LoopVectorizationLegality::MRK_FloatMax) - Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); - else - Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); - - Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); - return Select; -} - namespace { struct CSEDenseMapInfo { static bool canHandle(Instruction *I) { @@ -2772,6 +2464,95 @@ static Value *addFastMathFlag(Value *V) { return V; } +/// Estimate the overhead of scalarizing a value. Insert and Extract are set if +/// the result needs to be inserted and/or extracted from vectors. 
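+/// For example, on a target where each insert and extract costs 1, a
+/// <4 x float> value with both Insert and Extract set would cost
+/// 4 inserts + 4 extracts = 8.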
+static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, + const TargetTransformInfo &TTI) { + if (Ty->isVoidTy()) + return 0; + + assert(Ty->isVectorTy() && "Can only scalarize vectors"); + unsigned Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + +// Estimate cost of a call instruction CI if it were vectorized with factor VF. +// Return the cost of the instruction, including scalarization overhead if it's +// needed. The flag NeedToScalarize shows if the call needs to be scalarized - +// i.e. either vector version isn't available, or is too expensive. +static unsigned getVectorCallCost(CallInst *CI, unsigned VF, + const TargetTransformInfo &TTI, + const TargetLibraryInfo *TLI, + bool &NeedToScalarize) { + Function *F = CI->getCalledFunction(); + StringRef FnName = CI->getCalledFunction()->getName(); + Type *ScalarRetTy = CI->getType(); + SmallVector<Type *, 4> Tys, ScalarTys; + for (auto &ArgOp : CI->arg_operands()) + ScalarTys.push_back(ArgOp->getType()); + + // Estimate cost of scalarized vector call. The source operands are assumed + // to be vectors, so we need to extract individual elements from there, + // execute VF scalar calls, and then gather the result into the vector return + // value. + unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); + if (VF == 1) + return ScalarCallCost; + + // Compute corresponding vector type for return value and arguments. + Type *RetTy = ToVectorTy(ScalarRetTy, VF); + for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i) + Tys.push_back(ToVectorTy(ScalarTys[i], VF)); + + // Compute costs of unpacking argument values for the scalar calls and + // packing the return values to a vector. + unsigned ScalarizationCost = + getScalarizationOverhead(RetTy, true, false, TTI); + for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) + ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI); + + unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + + // If we can't emit a vector call for this function, then the currently found + // cost is the cost we need to return. + NeedToScalarize = true; + if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) + return Cost; + + // If the corresponding vector cost is cheaper, return its cost. + unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); + if (VectorCallCost < Cost) { + NeedToScalarize = false; + return VectorCallCost; + } + return Cost; +} + +// Estimate cost of an intrinsic call instruction CI if it were vectorized with +// factor VF. Return the cost of the instruction, including scalarization +// overhead if it's needed. 
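+// For instance, a scalar call to 'float @llvm.sqrt.f32(float)' at VF = 4 is
+// costed as '<4 x float> @llvm.sqrt.v4f32(<4 x float>)'; the caller compares
+// this against getVectorCallCost() and picks the cheaper lowering.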
+static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, + const TargetTransformInfo &TTI, + const TargetLibraryInfo *TLI) { + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + assert(ID && "Expected intrinsic call!"); + + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector<Type *, 4> Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); + + return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -2819,10 +2600,14 @@ void InnerLoopVectorizer::vectorizeLoop() { // Find the reduction variable descriptor. assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); - LoopVectorizationLegality::ReductionDescriptor RdxDesc = - (*Legal->getReductionVars())[RdxPhi]; + ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; - setDebugLocFromInst(Builder, RdxDesc.StartValue); + ReductionDescriptor::ReductionKind RK = RdxDesc.getReductionKind(); + TrackingVH<Value> ReductionStartValue = RdxDesc.getReductionStartValue(); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + ReductionInstDesc::MinMaxReductionKind MinMaxKind = + RdxDesc.getMinMaxReductionKind(); + setDebugLocFromInst(Builder, ReductionStartValue); // We need to generate a reduction vector from the incoming scalar. // To do so, we need to generate the 'identity' vector and override @@ -2831,40 +2616,38 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); + VectorParts &VectorExit = getVectorValue(LoopExitInst); Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. Value *Identity; Value *VectorStart; - if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || - RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { + if (RK == ReductionDescriptor::RK_IntegerMinMax || + RK == ReductionDescriptor::RK_FloatMinMax) { // MinMax reduction have the start value as their identify. if (VF == 1) { - VectorStart = Identity = RdxDesc.StartValue; + VectorStart = Identity = ReductionStartValue; } else { - VectorStart = Identity = Builder.CreateVectorSplat(VF, - RdxDesc.StartValue, - "minmax.ident"); + VectorStart = Identity = + Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); } } else { // Handle other reduction kinds: Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType()); + ReductionDescriptor::getReductionIdentity(RK, VecTy->getScalarType()); if (VF == 1) { Identity = Iden; // This vector is the Identity vector where the first element is the // incoming scalar reduction. - VectorStart = RdxDesc.StartValue; + VectorStart = ReductionStartValue; } else { Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. 
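// For an add reduction with VF = 4 and scalar start value %st, part 0 thus
// starts as <%st, 0, 0, 0> while the remaining unrolled parts start at the
// identity <0, 0, 0, 0>, so %st enters the final sum exactly once.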
- VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + VectorStart = + Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); } } @@ -2893,11 +2676,11 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); VectorParts RdxParts; - setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); + setDebugLocFromInst(Builder, LoopExitInst); for (unsigned part = 0; part < UF; ++part) { // This PHINode contains the vectorized reduction variable, or // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); + VectorParts &RdxExitVal = getVectorValue(LoopExitInst); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) @@ -2909,7 +2692,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = getReductionBinOp(RdxDesc.Kind); + unsigned Op = ReductionDescriptor::getReductionBinOp(RK); setDebugLocFromInst(Builder, ReducedPartRdx); for (unsigned part = 1; part < UF; ++part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) @@ -2918,8 +2701,8 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], ReducedPartRdx, "bin.rdx")); else - ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, - ReducedPartRdx, RdxParts[part]); + ReducedPartRdx = ReductionDescriptor::createMinMaxOp( + Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); } if (VF > 1) { @@ -2950,7 +2733,8 @@ void InnerLoopVectorizer::vectorizeLoop() { TmpVec = addFastMathFlag(Builder.CreateBinOp( (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); else - TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); + TmpVec = ReductionDescriptor::createMinMaxOp(Builder, MinMaxKind, + TmpVec, Shuf); } // The result is in the first element of the vector. @@ -2962,7 +2746,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -2980,7 +2764,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // We found our reduction value exit-PHI. Update it with the // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { // Add an edge coming from the bypass. LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); break; @@ -2995,7 +2779,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); }// end of for each redux variable. 
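// For illustration, a VF = 4 integer add reduction is collapsed by the
// shuffle sequence above in log2(VF) steps (value names invented):
//   %s1  = shufflevector %rdx, undef, <2, 3, undef, undef>
//   %r1  = add %rdx, %s1        ; lanes <a0+a2, a1+a3, _, _>
//   %s2  = shufflevector %r1, undef, <1, undef, undef, undef>
//   %r2  = add %r1, %s2         ; lane 0 holds a0+a1+a2+a3
//   %res = extractelement %r2, i32 0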
fixLCSSAPHIs(); @@ -3136,6 +2920,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, LoopVectorizationLegality::InductionInfo II = Legal->getInductionVars()->lookup(P); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); @@ -3153,80 +2939,42 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, - "offset.idx"); + Broadcasted = II.transform(Builder, NormalizedIdx); + Broadcasted->setName("offset.idx"); } Broadcasted = getBroadcastInstrs(Broadcasted); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = ExtendedIdx; - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); - return; - } - // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. - bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = + Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { - int EltIndex = (part) * (Reverse ? 
-1 : 1); + int EltIndex = part; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (Reverse) - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - else - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); Entry[part] = SclrGep; continue; } Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + int EltIndex = i + part * VF; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, Builder.getInt32(i), "insert.gep"); @@ -3246,7 +2994,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - case Instruction::PHI:{ + case Instruction::PHI: { // Vectorize PHINodes. widenPHIInstruction(it, Entry, UF, VF, PV); continue; @@ -3367,8 +3115,12 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(OldInduction); + Constant *Step = + ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); propagateMetadata(Entry, it); break; } @@ -3391,37 +3143,71 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); + + StringRef FnName = CI->getCalledFunction()->getName(); + Function *F = CI->getCalledFunction(); + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector<Type *, 4> Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); - assert(ID && "Not an intrinsic call!"); - switch (ID) { - case Intrinsic::assume: - case Intrinsic::lifetime_end: - case Intrinsic::lifetime_start: + if (ID && + (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { scalarizeInstruction(it); break; - default: - bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (HasScalarOpd && i == 1) { - Args.push_back(CI->getArgOperand(i)); - continue; - } - VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); - Args.push_back(Arg[Part]); - } - Type *Tys[] = {CI->getType()}; - if (VF > 1) - 
Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(it); + break; + } - Function *F = Intrinsic::getDeclaration(M, ID, Tys); - Entry[Part] = Builder.CreateCall(F, Args); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + Value *Arg = CI->getArgOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { + VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + Arg = VectorArg[Part]; + } + Args.push_back(Arg); } - propagateMetadata(Entry, it); - break; + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the library call. + StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); + assert(!VFnName.empty() && "Vector function name is empty."); + VectorF = M->getFunction(VFnName); + if (!VectorF) { + // Generate a declaration + FunctionType *FTy = FunctionType::get(RetTy, Tys, false); + VectorF = + Function::Create(FTy, Function::ExternalLinkage, VFnName, M); + VectorF->copyAttributesFrom(F); + } + } + assert(VectorF && "Can't create vector function."); + Entry[Part] = Builder.CreateCall(VectorF, Args); } + + propagateMetadata(Entry, it); break; } @@ -3484,7 +3270,7 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { - emitAnalysis(Report() << "if-conversion is disabled"); + emitAnalysis(VectorizationReport() << "if-conversion is disabled"); return false; } @@ -3517,7 +3303,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "loop contains a switch statement"); return false; } @@ -3525,12 +3311,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { if (!blockCanBePredicated(BB, SafePointes)) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } @@ -3545,27 +3331,30 @@ bool LoopVectorizationLegality::canVectorize() { // be canonicalized. 
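// A sketch of the three-way decision made above, with made-up costs standing
// in for the TTI queries behind getVectorCallCost/getVectorIntrinsicCost;
// pickCallStrategy and its parameters are illustrative names, not LLVM API.
#include <algorithm>
#include <cstdio>

enum class VecCallKind { Scalarize, LibCall, Intrinsic };

static VecCallKind pickCallStrategy(unsigned IntrinsicCost, unsigned LibCost,
                                    unsigned ScalarizedCost, bool HasIntrinsic,
                                    bool HasVectorLib) {
  // CallCost mirrors getVectorCallCost: the cheaper of a vector library call
  // and full scalarization.
  unsigned CallCost = HasVectorLib ? std::min(LibCost, ScalarizedCost)
                                   : ScalarizedCost;
  if (HasIntrinsic && IntrinsicCost <= CallCost)
    return VecCallKind::Intrinsic;   // ties prefer the intrinsic, as above
  if (HasVectorLib && LibCost <= ScalarizedCost)
    return VecCallKind::LibCall;
  return VecCallKind::Scalarize;
}

int main() {
  // E.g. a libm call with a vector-library mapping but an expensive generic
  // intrinsic lowering: the library call wins (prints 1).
  std::printf("%d\n", static_cast<int>(pickCallStrategy(20, 4, 16, true, true)));
}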
if (!TheLoop->getLoopPreheader()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } // We can only vectorize innermost loops. - if (TheLoop->getSubLoopsVector().size()) { - emitAnalysis(Report() << "loop is not the innermost loop"); + if (!TheLoop->getSubLoopsVector().empty()) { + emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } // We must have a single exiting block. if (!TheLoop->getExitingBlock()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } @@ -3574,7 +3363,8 @@ bool LoopVectorizationLegality::canVectorize() { // instructions in the loop are executed the same number of times. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } @@ -3592,7 +3382,8 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(Report() << "could not determine number of loop iterations"); + emitAnalysis(VectorizationReport() << + "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3613,7 +3404,8 @@ bool LoopVectorizationLegality::canVectorize() { collectLoopUniforms(); DEBUG(dbgs() << "LV: We can vectorize this loop" << - (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : + "") <<"!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis @@ -3667,10 +3459,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Look for the attribute signaling the absence of NaNs. Function &F = *Header->getParent(); + const DataLayout &DL = F.getParent()->getDataLayout(); if (F.hasFnAttribute("no-nans-fp-math")) - HasFunNoNaNAttr = F.getAttributes().getAttribute( - AttributeSet::FunctionIndex, - "no-nans-fp-math").getValueAsString() == "true"; + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -3686,7 +3478,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3700,14 +3492,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // identified reduction value with an outside user. 
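// The structural gate canVectorize() applies above, condensed into a single
// predicate against the LoopInfo API of this era; "explain" is a hypothetical
// logging callback standing in for emitAnalysis. Sketch only.
#include "llvm/Analysis/LoopInfo.h"
#include <functional>

static bool hasVectorizableShape(const llvm::Loop *L,
                                 const std::function<void(const char *)> &explain) {
  if (!L->getLoopPreheader()) {                    // must be in simplify form
    explain("loop control flow is not understood by vectorizer");
    return false;
  }
  if (!L->getSubLoops().empty()) {                 // innermost loops only
    explain("loop is not the innermost loop");
    return false;
  }
  if (L->getNumBackEdges() != 1) {                 // a single latch edge
    explain("loop control flow is not understood by vectorizer");
    return false;
  }
  if (!L->getExitingBlock() ||                     // one exiting block, and it
      L->getExitingBlock() != L->getLoopLatch()) { // must be the latch
    explain("loop control flow is not understood by vectorizer");
    return false;
  }
  return true;
}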
if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; - emitAnalysis(Report(it) << "value could not be identified as " - "an induction or reduction variable"); + emitAnalysis(VectorizationReport(it) << + "value could not be identified as " + "an induction or reduction variable"); return false; } // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; @@ -3715,18 +3508,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + ConstantInt *StepValue = nullptr; // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi); + InductionKind IK = isInductionVariable(Phi, StepValue); if (IK_NoInduction != IK) { // Get the widest type. if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); else - WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction) { + if (IK == IK_IntInduction && StepValue->isOne()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other // than it is expedient). @@ -3735,69 +3529,44 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK); + Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. 
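// What the isInductionVariable(Phi, StepValue) call above boils down to,
// simplified: the PHI must be an affine add-recurrence in this loop with a
// loop-invariant constant step. A sketch of the idea, not the actual
// isInductionPHI helper.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Instructions.h"

static llvm::ConstantInt *getConstantInductionStep(llvm::PHINode *Phi,
                                                   const llvm::Loop *L,
                                                   llvm::ScalarEvolution *SE) {
  const auto *AR = llvm::dyn_cast<llvm::SCEVAddRecExpr>(SE->getSCEV(Phi));
  if (!AR || AR->getLoop() != L || !AR->isAffine())
    return nullptr;                      // not "{start,+,step}" in this loop
  const auto *Step =
      llvm::dyn_cast<llvm::SCEVConstant>(AR->getStepRecurrence(*SE));
  return Step ? Step->getValue() : nullptr; // e.g. 1 for i += 1, 4 for i += 4
}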
if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(Report(it) << "use of induction value outside of the " - "loop is not handled by vectorizer"); + emitAnalysis(VectorizationReport(it) << + "use of induction value outside of the " + "loop is not handled by vectorizer"); return false; } continue; } - if (AddReductionVar(Phi, RK_IntegerAdd)) { - DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerMult)) { - DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerOr)) { - DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerAnd)) { - DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerXor)) { - DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerMinMax)) { - DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_FloatMult)) { - DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_FloatAdd)) { - DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_FloatMinMax)) { - DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi << - "\n"); + if (ReductionDescriptor::isReductionPHI(Phi, TheLoop, + Reductions[Phi])) { + AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); continue; } - emitAnalysis(Report(it) << "value that could not be identified as " - "reduction is used outside the loop"); + emitAnalysis(VectorizationReport(it) << + "value that could not be identified as " + "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling - // We still don't handle functions. However, we can ignore dbg intrinsic - // calls and we do handle certain intrinsic and libm functions. + // We handle calls that: + // * Are debug info intrinsics. + // * Have a mapping to an IR intrinsic. + // * Have a vector version available. CallInst *CI = dyn_cast<CallInst>(it); - if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { - emitAnalysis(Report(it) << "call instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found a call site.\n"); + if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && + !(CI->getCalledFunction() && TLI && + TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { + emitAnalysis(VectorizationReport(it) << + "call instruction cannot be vectorized"); + DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -3806,7 +3575,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -3817,7 +3586,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. 
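// The call-site rule stated in the comment above, folded into one predicate.
// "MapsToIntrinsic" stands in for getIntrinsicIDForCall(CI, TLI), a static
// helper in this file; a paraphrase for readability, not the original code.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IntrinsicInst.h"

static bool isVectorizableCall(llvm::CallInst *CI,
                               llvm::TargetLibraryInfo *TLI,
                               bool MapsToIntrinsic) {
  if (llvm::isa<llvm::DbgInfoIntrinsic>(CI))
    return true;                       // debug info intrinsics are harmless
  if (MapsToIntrinsic)
    return true;                       // e.g. sqrtf -> llvm.sqrt.f32
  llvm::Function *F = CI->getCalledFunction();
  return F && TLI &&
         TLI->isFunctionVectorizable(F->getName()); // vector libm mapping
}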
if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -3827,7 +3596,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (StoreInst *ST = dyn_cast<StoreInst>(it)) { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { - emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(ST) << + "store instruction cannot be vectorized"); return false; } if (EnableMemAccessVersioning) @@ -3841,7 +3611,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(Report(it) << "value cannot be used outside the loop"); + emitAnalysis(VectorizationReport(it) << + "value cannot be used outside the loop"); return false; } @@ -3852,7 +3623,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { - emitAnalysis(Report() + emitAnalysis(VectorizationReport() << "loop induction variable could not be identified"); return false; } @@ -3863,13 +3634,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { ///\brief Remove GEPs whose indices but the last one are loop invariant and /// return the induction operand of the gep pointer. -static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, - const DataLayout *DL, Loop *Lp) { +static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); if (!GEP) return Ptr; - unsigned InductionOperand = getGEPInductionOperand(DL, GEP); + unsigned InductionOperand = getGEPInductionOperand(GEP); // Check that all of the gep indices are uniform except for our induction // operand. @@ -3898,8 +3668,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { ///\brief Get the stride of a pointer access in a loop. /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a /// pointer to the Value, or null otherwise. -static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, - const DataLayout *DL, Loop *Lp) { +static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) return nullptr; @@ -3912,7 +3681,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, // The size of the pointer access. int64_t PtrAccessSize = 1; - Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); + Ptr = stripGetElementPtr(Ptr, SE, Lp); const SCEV *V = SE->getSCEV(Ptr); if (Ptr != OrigPtr) @@ -3931,7 +3700,8 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, // Strip off the size of access multiplication if we are still analyzing the // pointer. 
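// The source shape getStrideFromPointer() is hunting for, per its comment:
// a symbolic stride that collectStridedAccess() (below) records so the loop
// can be versioned on a runtime "stride == 1" test. Illustration only.
void scale_strided(float *a, const float *b, long n, long stride) {
  for (long i = 0; i < n; ++i)
    a[i * stride] = 2.0f * b[i * stride]; // consecutive exactly when stride == 1
}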
if (OrigPtr == Ptr) { - DL->getTypeAllocSize(PtrTy->getElementType()); + const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout(); + DL.getTypeAllocSize(PtrTy->getElementType()); if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { if (M->getOperand(0)->getSCEVType() != scConstant) return nullptr; @@ -3983,7 +3753,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); if (!Stride) return; @@ -4012,7 +3782,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { if (I->getType()->isPointerTy() && isConsecutivePtr(I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); - while (Worklist.size()) { + while (!Worklist.empty()) { Instruction *I = dyn_cast<Instruction>(Worklist.back()); Worklist.pop_back(); @@ -4030,1305 +3800,46 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } -namespace { -/// \brief Analyses memory accesses in a loop. -/// -/// Checks whether run time pointer checks are needed and builds sets for data -/// dependence checking. -class AccessAnalysis { -public: - /// \brief Read or write access location. - typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; - typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - - /// \brief Set of potential dependent memory accesses. - typedef EquivalenceClasses<MemAccessInfo> DepCandidates; - - AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : - DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} - - /// \brief Register a load and whether it is only read from. - void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { - Value *Ptr = const_cast<Value*>(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, false)); - if (IsReadOnly) - ReadOnlyPtr.insert(Ptr); - } - - /// \brief Register a store. - void addStore(AliasAnalysis::Location &Loc) { - Value *Ptr = const_cast<Value*>(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, true)); - } - - /// \brief Check whether we can check the pointers at runtime for - /// non-intersection. - bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, ValueToValueMap &Strides, - bool ShouldCheckStride = false); - - /// \brief Goes over all memory accesses, checks whether a RT check is needed - /// and builds sets of dependent accesses. - void buildDependenceSets() { - processMemAccesses(); - } - - bool isRTCheckNeeded() { return IsRTCheckNeeded; } - - bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } - void resetDepChecks() { CheckDeps.clear(); } - - MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } - -private: - typedef SetVector<MemAccessInfo> PtrAccessSet; - - /// \brief Go over all memory access and check whether runtime pointer checks - /// are needed /// and build sets of dependency check candidates. - void processMemAccesses(); - - /// Set of all accesses. - PtrAccessSet Accesses; - - /// Set of accesses that need a further dependence check. - MemAccessInfoSet CheckDeps; - - /// Set of pointers that are read only. - SmallPtrSet<Value*, 16> ReadOnlyPtr; - - const DataLayout *DL; - - /// An alias set tracker to partition the access set by underlying object and - //intrinsic property (such as TBAA metadata). 
- AliasSetTracker AST; - - /// Sets of potentially dependent accesses - members of one set share an - /// underlying pointer. The set "CheckDeps" identfies which sets really need a - /// dependence check. - DepCandidates &DepCands; - - bool IsRTCheckNeeded; -}; - -} // end anonymous namespace - -/// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, - Value *Ptr) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); - if (!AR) - return false; - - return AR->isAffine(); -} - -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap); - -bool AccessAnalysis::canCheckPtrAtRT( - LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, - ValueToValueMap &StridesMap, bool ShouldCheckStride) { - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. - bool CanDoRT = true; - - bool IsDepCheckNeeded = isDependencyCheckNeeded(); - NumComparisons = 0; - - // We assign a consecutive id to access from different alias sets. - // Accesses between different groups doesn't need to be checked. - unsigned ASId = 1; - for (auto &AS : AST) { - unsigned NumReadPtrChecks = 0; - unsigned NumWritePtrChecks = 0; - - // We assign consecutive id to access from different dependence sets. - // Accesses within the same set don't need a runtime check. - unsigned RunningDepId = 1; - DenseMap<Value *, unsigned> DepSetId; - - for (auto A : AS) { - Value *Ptr = A.getValue(); - bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); - MemAccessInfo Access(Ptr, IsWrite); - - if (IsWrite) - ++NumWritePtrChecks; - else - ++NumReadPtrChecks; - - if (hasComputableBounds(SE, StridesMap, Ptr) && - // When we run after a failing dependency check we have to make sure we - // don't have wrapping pointers. - (!ShouldCheckStride || - isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { - // The id of the dependence set. - unsigned DepId; - - if (IsDepCheckNeeded) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; - - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); - - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); - } else { - CanDoRT = false; - } - } - - if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) - NumComparisons += 0; // Only one dependence set. - else { - NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + - NumWritePtrChecks - 1)); - } - - ++ASId; - } - - // If the pointers that we would use for the bounds comparison have different - // address spaces, assume the values aren't directly comparable, so we can't - // use them for the runtime check. We also have to assume they could - // overlap. In the future there should be metadata for whether address spaces - // are disjoint. - unsigned NumPointers = RtCheck.Pointers.size(); - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i + 1; j < NumPointers; ++j) { - // Only need to check pointers between two different dependency sets. 
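// The pairwise filter implemented just below, restated as a free function;
// PtrCheckInfo is a hypothetical POD mirroring RtCheck's parallel arrays
// (Pointers / DependencySetId / AliasSetId). Sketch, not the original.
#include <cstddef>
#include <vector>

struct PtrCheckInfo { unsigned DepSetId, AliasSetId, AddrSpace; };

// Returns the number of pointer pairs that need a runtime bounds check, or
// -1 when two candidates live in different address spaces and thus cannot
// be compared at all (the case that defeats the check above).
static int countRuntimeCheckPairs(const std::vector<PtrCheckInfo> &P) {
  int N = 0;
  for (std::size_t i = 0; i < P.size(); ++i)
    for (std::size_t j = i + 1; j < P.size(); ++j) {
      if (P[i].DepSetId == P[j].DepSetId)       // same dependence set: safe
        continue;
      if (P[i].AliasSetId != P[j].AliasSetId)   // distinct alias sets: disjoint
        continue;
      if (P[i].AddrSpace != P[j].AddrSpace)
        return -1;
      ++N;
    }
  return N;
}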
- if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. - if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) - continue; - - Value *PtrI = RtCheck.Pointers[i]; - Value *PtrJ = RtCheck.Pointers[j]; - - unsigned ASi = PtrI->getType()->getPointerAddressSpace(); - unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); - if (ASi != ASj) { - DEBUG(dbgs() << "LV: Runtime check would require comparison between" - " different address spaces\n"); - return false; - } - } - } - - return CanDoRT; -} - -void AccessAnalysis::processMemAccesses() { - // We process the set twice: first we process read-write pointers, last we - // process read-only pointers. This allows us to skip dependence tests for - // read-only pointers. - - DEBUG(dbgs() << "LV: Processing memory accesses...\n"); - DEBUG(dbgs() << " AST: "; AST.dump()); - DEBUG(dbgs() << "LV: Accesses:\n"); - DEBUG({ - for (auto A : Accesses) - dbgs() << "\t" << *A.getPointer() << " (" << - (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? - "read-only" : "read")) << ")\n"; - }); - - // The AliasSetTracker has nicely partitioned our pointers by metadata - // compatibility and potential for underlying-object overlap. As a result, we - // only need to check for potential pointer dependencies within each alias - // set. - for (auto &AS : AST) { - // Note that both the alias-set tracker and the alias sets themselves used - // linked lists internally and so the iteration order here is deterministic - // (matching the original instruction order within each set). - - bool SetHasWrite = false; - - // Map of pointers to last access encountered. - typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; - UnderlyingObjToAccessMap ObjToLastAccess; - - // Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; - - // Iterate over each alias set twice, once to process read/write pointers, - // and then to process read-only pointers. - for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { - bool UseDeferred = SetIteration > 0; - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; - - for (auto AV : AS) { - Value *Ptr = AV.getValue(); - - // For a single memory access in AliasSetTracker, Accesses may contain - // both read and write, and they both need to be handled for CheckDeps. - for (auto AC : S) { - if (AC.getPointer() != Ptr) - continue; - - bool IsWrite = AC.getInt(); - - // If we're using the deferred access set, then it contains only - // reads. - bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; - if (UseDeferred && !IsReadOnlyPtr) - continue; - // Otherwise, the pointer must be in the PtrAccessSet, either as a - // read or a write. - assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || - S.count(MemAccessInfo(Ptr, false))) && - "Alias-set pointer not in the access set?"); - - MemAccessInfo Access(Ptr, IsWrite); - DepCands.insert(Access); - - // Memorize read-only pointers for later processing and skip them in - // the first round (they need to be checked after we have seen all - // write pointers). Note: we also mark pointer that are not - // consecutive as "read-only" pointers (so that we check - // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". - if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); - continue; - } - - // If this is a write - check other reads and writes for conflicts. 
If - // this is a read, only check other writes for conflicts (but only if - // there is no other write to the ptr - this is an optimization to - // catch "a[i] = a[i] + " without having to do a dependence check). - if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { - CheckDeps.insert(Access); - IsRTCheckNeeded = true; - } - - if (IsWrite) - SetHasWrite = true; - - // Create sets of pointers connected by a shared alias set and - // underlying object. - typedef SmallVector<Value *, 16> ValueVector; - ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL); - for (Value *UnderlyingObj : TempObjects) { - UnderlyingObjToAccessMap::iterator Prev = - ObjToLastAccess.find(UnderlyingObj); - if (Prev != ObjToLastAccess.end()) - DepCands.unionSets(Access, Prev->second); - - ObjToLastAccess[UnderlyingObj] = Access; - } - } - } - } - } -} - -namespace { -/// \brief Checks memory dependences among accesses to the same underlying -/// object to determine whether vectorization is legal or not (and at -/// which vectorization factor). -/// -/// This class works under the assumption that we already checked that memory -/// locations with different underlying pointers are "must-not alias". -/// We use the ScalarEvolution framework to symbolically evaluate access -/// function pairs. Since we currently don't restructure the loop we can rely -/// on the program order of memory accesses to determine their safety. -/// At the moment we will only deem accesses as safe for: -/// * A negative constant distance assuming program order. -/// -/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; -/// a[i] = tmp; y = a[i]; -/// -/// The latter case is safe because later checks guarantee that there can't -/// be a cycle through a phi node (that is, we check that "x" and "y" are not -/// the same variable: a header phi can only be an induction or a reduction, a -/// reduction can't have a memory sink, an induction can't have a memory -/// source). This is important and must not be violated (or we have to -/// resort to checking for cycles through memory). -/// -/// * A positive constant distance assuming program order that is bigger -/// than the biggest memory access. -/// -/// tmp = a[i] OR b[i] = x -/// a[i+2] = tmp y = b[i+2]; -/// -/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. -/// -/// * Zero distances and all accesses have the same size. -/// -class MemoryDepChecker { -public: - typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; - typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - - MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) - : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), - ShouldRetryWithRuntimeCheck(false) {} - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(StoreInst *SI) { - Value *Ptr = SI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); - InstMap.push_back(SI); - ++AccessIdx; - } - - /// \brief Register the location (instructions are given increasing numbers) - /// of a read access. - void addAccess(LoadInst *LI) { - Value *Ptr = LI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); - InstMap.push_back(LI); - ++AccessIdx; - } - - /// \brief Check whether the dependencies between the accesses are safe. - /// - /// Only checks sets with elements in \p CheckDeps. 
- bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); - - /// \brief The maximum number of bytes of a vector register we can vectorize - /// the accesses safely with. - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } - - /// \brief In some cases when the dependency check fails we can still - /// vectorize the loop with a dynamic array access check. - bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } - -private: - ScalarEvolution *SE; - const DataLayout *DL; - const Loop *InnermostLoop; - - /// \brief Maps access locations (ptr, read/write) to program order. - DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; - - /// \brief Memory access instructions in program order. - SmallVector<Instruction *, 16> InstMap; - - /// \brief The program order index to be used for the next instruction. - unsigned AccessIdx; - - // We can access this many bytes in parallel safely. - unsigned MaxSafeDepDistBytes; - - /// \brief If we see a non-constant dependence distance we can still try to - /// vectorize this loop with runtime checks. - bool ShouldRetryWithRuntimeCheck; - - /// \brief Check whether there is a plausible dependence between the two - /// accesses. - /// - /// Access \p A must happen before \p B in program order. The two indices - /// identify the index into the program order map. - /// - /// This function checks whether there is a plausible dependence (or the - /// absence of such can't be proved) between the two accesses. If there is a - /// plausible dependence but the dependence distance is bigger than one - /// element access it records this distance in \p MaxSafeDepDistBytes (if this - /// distance is smaller than any other distance encountered so far). - /// Otherwise, this function returns true signaling a possible dependence. - bool isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides); - - /// \brief Check whether the data dependence could prevent store-load - /// forwarding. - bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); -}; - -} // end anonymous namespace - -static bool isInBoundsGep(Value *Ptr) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) - return GEP->isInBounds(); - return false; -} - -/// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap) { - const Type *Ty = Ptr->getType(); - assert(Ty->isPointerTy() && "Unexpected non-ptr"); - - // Make sure that the pointer does not point to aggregate types. - const PointerType *PtrTy = cast<PointerType>(Ty); - if (PtrTy->getElementType()->isAggregateType()) { - DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << - "\n"); - return 0; - } - - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); - - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); - if (!AR) { - DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // The access function must stride over the innermost loop. - if (Lp != AR->getLoop()) { - DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << - *Ptr << " SCEV: " << *PtrScev << "\n"); - } - - // The address calculation must not wrap. Otherwise, a dependence could be - // inverted. 
- // An inbounds getelementptr that is an AddRec with a unit stride - // cannot wrap by definition. The unit stride requirement is checked later. - // A getelementptr without an inbounds attribute and unit stride would have - // to access the pointer value "0" which is undefined behavior in address - // space 0, therefore we can also vectorize this case. - bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); - bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; - if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { - DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Calculate the pointer stride and check if it is consecutive. - const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); - if (!C) { - DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << - " SCEV: " << *PtrScev << "\n"); - return 0; - } - - int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); - const APInt &APStepVal = C->getValue()->getValue(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return 0; - - int64_t StepVal = APStepVal.getSExtValue(); - - // Strided access. - int64_t Stride = StepVal / Size; - int64_t Rem = StepVal % Size; - if (Rem) - return 0; - - // If the SCEV could wrap but we have an inbounds gep with a unit stride we - // know we can't "wrap around the address space". In case of address space - // zero we know that this won't happen without triggering undefined behavior. - if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && - Stride != 1 && Stride != -1) - return 0; - - return Stride; -} - -bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, - unsigned TypeByteSize) { - // If loads occur at a distance that is not a multiple of a feasible vector - // factor store-load forwarding does not take place. - // Positive dependences might cause trouble because vectorizing them might - // prevent store-load forwarding making vectorized code run a lot slower. - // a[i] = a[i-3] ^ a[i-8]; - // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and - // hence on your typical architecture store-load forwarding does not take - // place. Vectorizing in such cases does not make sense. - // Store-load forwarding distance. - const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; - // Maximum vector factor. 
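// A standalone re-computation of the cap derived by the loop below, using
// the example from the comment above ("a[i] = a[i-3] ^ a[i-8]") with i32
// elements, i.e. TypeByteSize = 4 and Distance = 12 bytes. Simplified: the
// real code also clamps against MaxSafeDepDistBytes.
#include <cstdio>

static unsigned slForwardSafeVFBytes(unsigned Distance, unsigned TypeByteSize,
                                     unsigned MaxVFBytes) {
  const unsigned Window = 8 * TypeByteSize; // store-load forwarding distance
  for (unsigned vf = 2 * TypeByteSize; vf <= MaxVFBytes; vf *= 2)
    if (Distance % vf && Distance / vf < Window)
      return vf / 2; // first width whose loads straddle the earlier store
  return MaxVFBytes;
}

int main() {
  // Prints 4; since 4 < 2 * TypeByteSize, this dependence gets reported as
  // a store-load forwarding conflict, matching the final check below.
  std::printf("%u\n", slForwardSafeVFBytes(12, 4, 64));
}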
- unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; - if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) - MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; - - for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; - vf *= 2) { - if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { - MaxVFWithoutSLForwardIssues = (vf >>=1); - break; - } - } - - if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { - DEBUG(dbgs() << "LV: Distance " << Distance << - " that could cause a store-load forwarding conflict\n"); - return true; - } - - if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && - MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) - MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; - return false; -} - -bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides) { - assert (AIdx < BIdx && "Must pass arguments in program order"); - - Value *APtr = A.getPointer(); - Value *BPtr = B.getPointer(); - bool AIsWrite = A.getInt(); - bool BIsWrite = B.getInt(); - - // Two reads are independent. - if (!AIsWrite && !BIsWrite) - return false; - - // We cannot check pointers in different address spaces. - if (APtr->getType()->getPointerAddressSpace() != - BPtr->getType()->getPointerAddressSpace()) - return true; - - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); - - int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); - int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); - - const SCEV *Src = AScev; - const SCEV *Sink = BScev; - - // If the induction step is negative we have to invert source and sink of the - // dependence. - if (StrideAPtr < 0) { - //Src = BScev; - //Sink = AScev; - std::swap(APtr, BPtr); - std::swap(Src, Sink); - std::swap(AIsWrite, BIsWrite); - std::swap(AIdx, BIdx); - std::swap(StrideAPtr, StrideBPtr); - } - - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); - - DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); - DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); - - // Need consecutive accesses. We don't want to vectorize - // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in - // the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ - DEBUG(dbgs() << "Non-consecutive pointer access\n"); - return true; - } - - const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); - if (!C) { - DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n"); - ShouldRetryWithRuntimeCheck = true; - return true; - } - - Type *ATy = APtr->getType()->getPointerElementType(); - Type *BTy = BPtr->getType()->getPointerElementType(); - unsigned TypeByteSize = DL->getTypeAllocSize(ATy); - - // Negative distances are not plausible dependencies. - const APInt &Val = C->getValue()->getValue(); - if (Val.isNegative()) { - bool IsTrueDataDependence = (AIsWrite && !BIsWrite); - if (IsTrueDataDependence && - (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || - ATy != BTy)) - return true; - - DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n"); - return false; - } - - // Write to the same location with the same size. - // Could be improved to assert type sizes are the same (i32 == float, etc). 
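// The three distance cases classified around here, written out as loops;
// they mirror the examples in the MemoryDepChecker comment earlier in this
// hunk (i32 elements, so one element is 4 bytes). Illustration only.
void distance_cases(int *a, int x, int n) {
  for (int i = 0; i < n - 1; ++i) { // Dist = -1 element: safe in program order
    int tmp = a[i + 1];
    a[i] = tmp;
  }
  for (int i = 0; i < n; ++i)       // Dist = 0, same access size: safe
    a[i] = a[i] + x;
  for (int i = 0; i < n - 2; ++i) { // Dist = +2 elements: allowed, but caps
    int tmp = a[i];                 // MaxSafeDepDistBytes at 8, i.e. VF = 2
    a[i + 2] = tmp;
  }
}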
- if (Val == 0) { - if (ATy == BTy) - return false; - DEBUG(dbgs() << "LV: Zero dependence difference but different types\n"); - return true; - } - - assert(Val.isStrictlyPositive() && "Expect a positive value"); - - // Positive distance bigger than max vectorization factor. - if (ATy != BTy) { - DEBUG(dbgs() << - "LV: ReadWrite-Write positive dependency with different types\n"); - return false; - } - - unsigned Distance = (unsigned) Val.getZExtValue(); - - // Bail out early if passed-in parameters make vectorization not feasible. - unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; - unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LV: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); - return true; - } - - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; - - bool IsTrueDataDependence = (!AIsWrite && BIsWrite); - if (IsTrueDataDependence && - couldPreventStoreLoadForward(Distance, TypeByteSize)) - return true; - - DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << - " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); - - return false; -} - -bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, - ValueToValueMap &Strides) { - - MaxSafeDepDistBytes = -1U; - while (!CheckDeps.empty()) { - MemAccessInfo CurAccess = *CheckDeps.begin(); - - // Get the relevant memory access set. - EquivalenceClasses<MemAccessInfo>::iterator I = - AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); - - // Check accesses within this set. - EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; - AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); - - // Check every access pair. - while (AI != AE) { - CheckDeps.erase(*AI); - EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); - while (OI != AE) { - // Check every accessing instruction pair in program order. - for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), - I1E = Accesses[*AI].end(); I1 != I1E; ++I1) - for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), - I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { - if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) - return false; - if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) - return false; - } - ++OI; - } - AI++; - } - } - return true; -} - bool LoopVectorizationLegality::canVectorizeMemory() { - - typedef SmallVector<Value*, 16> ValueVector; - typedef SmallPtrSet<Value*, 16> ValueSet; - - // Holds the Load and Store *instructions*. - ValueVector Loads; - ValueVector Stores; - - // Holds all the different accesses in the loop. - unsigned NumReads = 0; - unsigned NumReadWrites = 0; - - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; - - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - MemoryDepChecker DepChecker(SE, DL, TheLoop); - - // For each block. - for (Loop::block_iterator bb = TheLoop->block_begin(), - be = TheLoop->block_end(); bb != be; ++bb) { - - // Scan the BB and collect legal loads and stores. 
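// What "legal" means in the scan below, as a standalone predicate: only
// simple (non-atomic, non-volatile) loads and stores pass, unless the loop
// is annotated parallel; any other memory-touching instruction bails out.
// A paraphrase of the checks below, not the original code.
#include "llvm/IR/Instructions.h"

static bool isLegalMemInst(llvm::Instruction *I, bool IsAnnotatedParallel) {
  if (auto *LD = llvm::dyn_cast<llvm::LoadInst>(I))
    return LD->isSimple() || IsAnnotatedParallel;
  if (auto *ST = llvm::dyn_cast<llvm::StoreInst>(I))
    return ST->isSimple() || IsAnnotatedParallel;
  return !I->mayReadOrWriteMemory(); // known read-only calls handled separately
}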
- for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (it->mayReadFromMemory()) { - // Many math library functions read the rounding mode. We will only - // vectorize a loop if it contains known function calls that don't set - // the flag. Therefore, it is safe to ignore this read from memory. - CallInst *Call = dyn_cast<CallInst>(it); - if (Call && getIntrinsicIDForCall(Call, TLI)) - continue; - - LoadInst *Ld = dyn_cast<LoadInst>(it); - if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { - emitAnalysis(Report(Ld) - << "read with atomic ordering or volatile read"); - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; - } - NumLoads++; - Loads.push_back(Ld); - DepChecker.addAccess(Ld); - continue; - } - - // Save 'store' instructions. Abort if other instructions write to memory. - if (it->mayWriteToMemory()) { - StoreInst *St = dyn_cast<StoreInst>(it); - if (!St) { - emitAnalysis(Report(it) << "instruction cannot be vectorized"); - return false; - } - if (!St->isSimple() && !IsAnnotatedParallel) { - emitAnalysis(Report(St) - << "write with atomic ordering or volatile write"); - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; - } - NumStores++; - Stores.push_back(St); - DepChecker.addAccess(St); - } - } // Next instr. - } // Next block. - - // Now we have two lists that hold the loads and the stores. - // Next, we find the pointers that they use. - - // Check if we see any stores. If there are no stores, then we don't - // care if the pointers are *restrict*. - if (!Stores.size()) { - DEBUG(dbgs() << "LV: Found a read-only loop!\n"); - return true; - } - - AccessAnalysis::DepCandidates DependentAccesses; - AccessAnalysis Accesses(DL, AA, DependentAccesses); - - // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects - // multiple times on the same object. If the ptr is accessed twice, once - // for read and once for write, it will only appear once (on the write - // list). This is okay, since we are going to check for conflicts between - // writes and between reads and writes, but not between reads and reads. - ValueSet Seen; - - ValueVector::iterator I, IE; - for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = cast<StoreInst>(*I); - Value* Ptr = ST->getPointerOperand(); - - if (isUniform(Ptr)) { - emitAnalysis( - Report(ST) - << "write to a loop invariant address could not be vectorized"); - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - // If we did *not* see this pointer before, insert it to the read-write - // list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr).second) { - ++NumReadWrites; - - AliasAnalysis::Location Loc = AA->getLocation(ST); - // The TBAA metadata could have a control dependency on the predication - // condition, so we cannot rely on it when determining whether or not we - // need runtime pointer checks. 
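// Why the TBAA tag is dropped just below -- a sketch of the hazard: under
// if-conversion the guard becomes a mask, so type metadata that was only
// valid on the guarded path can no longer be trusted for whole-vector
// reasoning. Hypothetical example, not taken from the patch.
union IntOrFloat { int i; float f; };
void tag_dependent(union IntOrFloat *v, const bool *isFloat, int n) {
  for (int k = 0; k < n; ++k)
    if (isFloat[k])
      v[k].f = 1.0f; // float-typed store, valid only while the guard holds
}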
- if (blockNeedsPredication(ST->getParent())) - Loc.AATags.TBAA = nullptr; - - Accesses.addStore(Loc); - } - } - - if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - - for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = cast<LoadInst>(*I); - Value* Ptr = LD->getPointerOperand(); - // If we did *not* see this pointer before, insert it to the - // read list. If we *did* see it before, then it is already in - // the read-write list. This allows us to vectorize expressions - // such as A[i] += x; Because the address of A[i] is a read-write - // pointer. This only works if the index of A[i] is consecutive. - // If the address of i is unknown (for example A[B[i]]) then we may - // read a few words, modify, and write a few words, and some of the - // words may be written to the same address. - bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || - !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { - ++NumReads; - IsReadOnlyPtr = true; - } - - AliasAnalysis::Location Loc = AA->getLocation(LD); - // The TBAA metadata could have a control dependency on the predication - // condition, so we cannot rely on it when determining whether or not we - // need runtime pointer checks. - if (blockNeedsPredication(LD->getParent())) - Loc.AATags.TBAA = nullptr; - - Accesses.addLoad(Loc, IsReadOnlyPtr); - } - - // If we write (or read-write) to a single destination and there are no - // other reads in this loop then is it safe to vectorize. - if (NumReadWrites == 1 && NumReads == 0) { - DEBUG(dbgs() << "LV: Found a write-only loop!\n"); - return true; - } - - // Build dependence sets and check whether we need a runtime pointer bounds - // check. - Accesses.buildDependenceSets(); - bool NeedRTCheck = Accesses.isRTCheckNeeded(); - - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. - unsigned NumComparisons = 0; - bool CanDoRT = false; - if (NeedRTCheck) - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, - Strides); - - DEBUG(dbgs() << "LV: We need to do " << NumComparisons << - " pointer comparisons.\n"); - - // If we only have one set of dependences to check pointers among we don't - // need a runtime check. - if (NumComparisons == 0 && NeedRTCheck) - NeedRTCheck = false; - - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - PtrRtCheck.reset(); - CanDoRT = false; - } - - if (CanDoRT) { - DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); - } - - if (NeedRTCheck && !CanDoRT) { - emitAnalysis(Report() << "cannot identify array bounds"); - DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << - "the array bounds.\n"); - PtrRtCheck.reset(); - return false; - } - - PtrRtCheck.Need = NeedRTCheck; - - bool CanVecMem = true; - if (Accesses.isDependencyCheckNeeded()) { - DEBUG(dbgs() << "LV: Checking memory dependencies\n"); - CanVecMem = DepChecker.areDepsSafe( - DependentAccesses, Accesses.getDependenciesToCheck(), Strides); - MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); - - if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { - DEBUG(dbgs() << "LV: Retrying with memory checks\n"); - NeedRTCheck = true; - - // Clear the dependency checks. We assume they are not needed. 
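// The retry protocol implemented here, reduced to its decision logic; all
// parameters are stand-ins for DepChecker/AccessAnalysis state, so this is
// a sketch of the control flow rather than the original interfaces.
static bool canVectorizeMemDecision(bool DepsSafe, bool SawNonConstantDistance,
                                    bool PtrBoundsComputable,
                                    unsigned NumComparisons, unsigned Threshold) {
  if (DepsSafe)
    return true;                 // dependences proved safe: no checks needed
  if (!SawNonConstantDistance)
    return false;                // a proven constant-distance conflict
  // Second attempt: drop the dependence sets and guard the loop with runtime
  // pointer checks instead, now also verifying unit strides.
  return PtrBoundsComputable && NumComparisons <= Threshold;
}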
- Accesses.resetDepChecks(); - - PtrRtCheck.reset(); - PtrRtCheck.Need = true; - - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, - TheLoop, Strides, true); - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - if (!CanDoRT && NumComparisons > 0) - emitAnalysis(Report() - << "cannot check memory dependencies at runtime"); - else - emitAnalysis(Report() - << NumComparisons << " exceeds limit of " - << RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); - PtrRtCheck.reset(); - return false; - } - - CanVecMem = true; - } - } - - if (!CanVecMem) - emitAnalysis(Report() << "unsafe dependent memory operations in loop"); - - DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << - " need a runtime memory check.\n"); - - return CanVecMem; -} - -static bool hasMultipleUsesOf(Instruction *I, - SmallPtrSetImpl<Instruction *> &Insts) { - unsigned NumUses = 0; - for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { - if (Insts.count(dyn_cast<Instruction>(*Use))) - ++NumUses; - if (NumUses > 1) - return true; - } - - return false; -} - -static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) { - for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) - if (!Set.count(dyn_cast<Instruction>(*Use))) - return false; - return true; -} - -bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, - ReductionKind Kind) { - if (Phi->getNumIncomingValues() != 2) + LAI = &LAA->getInfo(TheLoop, Strides); + auto &OptionalReport = LAI->getReport(); + if (OptionalReport) + emitAnalysis(VectorizationReport(*OptionalReport)); + if (!LAI->canVectorizeMemory()) return false; - // Reduction variables are only found in the loop header block. - if (Phi->getParent() != TheLoop->getHeader()) + if (LAI->hasStoreToLoopInvariantAddress()) { + emitAnalysis( + VectorizationReport() + << "write to a loop invariant address could not be vectorized"); + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; - - // Obtain the reduction start value from the value that comes from the loop - // preheader. - Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); - - // ExitInstruction is the single value which is used outside the loop. - // We only allow for a single reduction value to be used outside the loop. - // This includes users of the reduction, variables (which form a cycle - // which ends in the phi node). - Instruction *ExitInstruction = nullptr; - // Indicates that we found a reduction operation in our scan. - bool FoundReduxOp = false; - - // We start with the PHI node and scan for all of the users of this - // instruction. All users must be instructions that can be used as reduction - // variables (such as ADD). We must have a single out-of-block user. The cycle - // must include the original PHI. - bool FoundStartPHI = false; - - // To recognize min/max patterns formed by a icmp select sequence, we store - // the number of instruction we saw from the recognized min/max pattern, - // to make sure we only see exactly the two instructions. 
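// Source-level shapes this scan accepts; each cycle runs PHI -> op -> PHI
// and has a single out-of-loop user. Illustration of two of the kinds
// handled here (integer add and integer min/max).
int reduction_shapes(const int *a, int n) {
  int sum = 0, mx = a[0];
  for (int i = 1; i < n; ++i) {
    sum += a[i];                // RK_IntegerAdd: one binary op in the cycle
    mx = a[i] > mx ? a[i] : mx; // RK_IntegerMinMax: exactly icmp + select,
  }                             // i.e. NumCmpSelectPatternInst == 2
  return sum ^ mx;              // one outside user per reduction chain
}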
- unsigned NumCmpSelectPatternInst = 0; - ReductionInstDesc ReduxDesc(false, nullptr); - - SmallPtrSet<Instruction *, 8> VisitedInsts; - SmallVector<Instruction *, 8> Worklist; - Worklist.push_back(Phi); - VisitedInsts.insert(Phi); - - // A value in the reduction can be used: - // - By the reduction: - // - Reduction operation: - // - One use of reduction value (safe). - // - Multiple use of reduction value (not safe). - // - PHI: - // - All uses of the PHI must be the reduction (safe). - // - Otherwise, not safe. - // - By one instruction outside of the loop (safe). - // - By further instructions outside of the loop (not safe). - // - By an instruction that is not part of the reduction (not safe). - // This is either: - // * An instruction type other than PHI or the reduction operation. - // * A PHI in the header other than the initial PHI. - while (!Worklist.empty()) { - Instruction *Cur = Worklist.back(); - Worklist.pop_back(); - - // No Users. - // If the instruction has no users then this is a broken chain and can't be - // a reduction variable. - if (Cur->use_empty()) - return false; - - bool IsAPhi = isa<PHINode>(Cur); - - // A header PHI use other than the original PHI. - if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) - return false; - - // Reductions of instructions such as Div, and Sub is only possible if the - // LHS is the reduction variable. - if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && - !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && - !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) - return false; - - // Any reduction instruction must be of one of the allowed kinds. - ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); - if (!ReduxDesc.IsReduction) - return false; - - // A reduction operation must only have one use of the reduction value. - if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && - hasMultipleUsesOf(Cur, VisitedInsts)) - return false; - - // All inputs to a PHI node must be a reduction value. - if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) - return false; - - if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || - isa<SelectInst>(Cur))) - ++NumCmpSelectPatternInst; - if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || - isa<SelectInst>(Cur))) - ++NumCmpSelectPatternInst; - - // Check whether we found a reduction operator. - FoundReduxOp |= !IsAPhi; - - // Process users of current instruction. Push non-PHI nodes after PHI nodes - // onto the stack. This way we are going to have seen all inputs to PHI - // nodes once we get to them. - SmallVector<Instruction *, 8> NonPHIs; - SmallVector<Instruction *, 8> PHIs; - for (User *U : Cur->users()) { - Instruction *UI = cast<Instruction>(U); - - // Check if we found the exit user. - BasicBlock *Parent = UI->getParent(); - if (!TheLoop->contains(Parent)) { - // Exit if you find multiple outside users or if the header phi node is - // being used. In this case the user uses the value of the previous - // iteration, in which case we would loose "VF-1" iterations of the - // reduction operation if we vectorize. - if (ExitInstruction != nullptr || Cur == Phi) - return false; - - // The instruction used by an outside user must be the last instruction - // before we feed back to the reduction phi. Otherwise, we loose VF-1 - // operations on the value. - if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) - return false; - - ExitInstruction = Cur; - continue; - } - - // Process instructions only once (termination). 
Each reduction cycle - // value must only be used once, except by phi nodes and min/max - // reductions which are represented as a cmp followed by a select. - ReductionInstDesc IgnoredVal(false, nullptr); - if (VisitedInsts.insert(UI).second) { - if (isa<PHINode>(UI)) - PHIs.push_back(UI); - else - NonPHIs.push_back(UI); - } else if (!isa<PHINode>(UI) && - ((!isa<FCmpInst>(UI) && - !isa<ICmpInst>(UI) && - !isa<SelectInst>(UI)) || - !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) - return false; - - // Remember that we completed the cycle. - if (UI == Phi) - FoundStartPHI = true; - } - Worklist.append(PHIs.begin(), PHIs.end()); - Worklist.append(NonPHIs.begin(), NonPHIs.end()); } - // This means we have seen one but not the other instruction of the - // pattern or more than just a select and cmp. - if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && - NumCmpSelectPatternInst != 2) - return false; - - if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) + if (LAI->getNumRuntimePointerChecks() > + VectorizerParams::RuntimeMemoryCheckThreshold) { + emitAnalysis(VectorizationReport() + << LAI->getNumRuntimePointerChecks() << " exceeds limit of " + << VectorizerParams::RuntimeMemoryCheckThreshold + << " dependent memory operations checked at runtime"); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); return false; - - // We found a reduction var if we have reached the original phi node and we - // only have a single instruction with out-of-loop users. - - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); - - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.MinMaxKind); - Reductions[Phi] = RD; - // We've ended the cycle. This is a reduction variable if we have an - // outside user and it has a binary op. - - return true; -} - -/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction -/// pattern corresponding to a min(X, Y) or max(X, Y). -LoopVectorizationLegality::ReductionInstDesc -LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, - ReductionInstDesc &Prev) { - - assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && - "Expect a select instruction"); - Instruction *Cmp = nullptr; - SelectInst *Select = nullptr; - - // We must handle the select(cmp()) as a single instruction. Advance to the - // select. - if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { - if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) - return ReductionInstDesc(false, I); - return ReductionInstDesc(Select, Prev.MinMaxKind); - } - - // Only handle single use cases for now. - if (!(Select = dyn_cast<SelectInst>(I))) - return ReductionInstDesc(false, I); - if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && - !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) - return ReductionInstDesc(false, I); - if (!Cmp->hasOneUse()) - return ReductionInstDesc(false, I); - - Value *CmpLeft; - Value *CmpRight; - - // Look for a min/max pattern. 
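// The PatternMatch idiom used below, trimmed to a single case: m_SMax
// matches "select (icmp sgt L, R), L, R" and its commuted forms as a
// signed max. A sketch of the idiom, not the full dispatch.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

static bool isSignedMaxSelect(llvm::SelectInst *Select) {
  llvm::Value *L, *R;
  return llvm::PatternMatch::m_SMax(llvm::PatternMatch::m_Value(L),
                                    llvm::PatternMatch::m_Value(R))
      .match(Select);
}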
- if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_UIntMin); - else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_UIntMax); - else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_SIntMax); - else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_SIntMin); - else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMin); - else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMax); - else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMin); - else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMax); - - return ReductionInstDesc(false, I); -} - -LoopVectorizationLegality::ReductionInstDesc -LoopVectorizationLegality::isReductionInstr(Instruction *I, - ReductionKind Kind, - ReductionInstDesc &Prev) { - bool FP = I->getType()->isFloatingPointTy(); - bool FastMath = FP && I->hasUnsafeAlgebra(); - switch (I->getOpcode()) { - default: - return ReductionInstDesc(false, I); - case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && - Kind != RK_FloatMinMax)) - return ReductionInstDesc(false, I); - return ReductionInstDesc(I, Prev.MinMaxKind); - case Instruction::Sub: - case Instruction::Add: - return ReductionInstDesc(Kind == RK_IntegerAdd, I); - case Instruction::Mul: - return ReductionInstDesc(Kind == RK_IntegerMult, I); - case Instruction::And: - return ReductionInstDesc(Kind == RK_IntegerAnd, I); - case Instruction::Or: - return ReductionInstDesc(Kind == RK_IntegerOr, I); - case Instruction::Xor: - return ReductionInstDesc(Kind == RK_IntegerXor, I); - case Instruction::FMul: - return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); - case Instruction::FSub: - case Instruction::FAdd: - return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: - if (Kind != RK_IntegerMinMax && - (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) - return ReductionInstDesc(false, I); - return isMinMaxSelectCmpPattern(I, Prev); } + return true; } LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { - Type *PhiTy = Phi->getType(); - // We only handle integer and pointer inductions variables. - if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) - return IK_NoInduction; - - // Check that the PHI is consecutive. - const SCEV *PhiScev = SE->getSCEV(Phi); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); - if (!AR) { - DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return IK_NoInduction; - } - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Integer inductions need to have a stride of one. - if (PhiTy->isIntegerTy()) { - if (Step->isOne()) - return IK_IntInduction; - if (Step->isAllOnesValue()) - return IK_ReverseIntInduction; - return IK_NoInduction; - } - - // Calculate the pointer stride and check if it is consecutive. 
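
A hedged source-level illustration of the two induction shapes this classification distinguishes (the C is hypothetical; the classification names are the patch's):

    void scale(float *A, long N) {
      for (long i = 0; i < N; ++i)         // i: add-rec {0,+,1} -> IK_IntInduction
        A[i] *= 2.0f;
      for (float *P = A; P != A + N; ++P)  // P: add-rec {A,+,4} -> IK_PtrInduction,
        *P *= 2.0f;                        // step == alloc size of float (4 bytes)
    }

The stride logic that follows used to be done by hand here; the patch delegates it to the shared isInductionPHI() helper, which returns the constant step through StepValue.
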
- const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); - if (!C) - return IK_NoInduction; - - assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); - Type *PointerElementType = PhiTy->getPointerElementType(); - // The pointer stride cannot be determined if the pointer element type is not - // sized. - if (!PointerElementType->isSized()) +LoopVectorizationLegality::isInductionVariable(PHINode *Phi, + ConstantInt *&StepValue) { + if (!isInductionPHI(Phi, SE, StepValue)) return IK_NoInduction; - uint64_t Size = DL->getTypeAllocSize(PointerElementType); - if (C->getValue()->equalsInt(Size)) - return IK_PtrInduction; - else if (C->getValue()->equalsInt(0 - Size)) - return IK_ReversePtrInduction; - - return IK_NoInduction; + Type *PhiTy = Phi->getType(); + // Found an Integer induction variable. + if (PhiTy->isIntegerTy()) + return IK_IntInduction; + // Found an Pointer induction variable. + return IK_PtrInduction; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -5341,11 +3852,7 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) { } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - assert(TheLoop->contains(BB) && "Unknown block used"); - - // Blocks that do not dominate the latch need predication. - BasicBlock* Latch = TheLoop->getLoopLatch(); - return !DT->dominates(BB, Latch); + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, @@ -5421,13 +3928,17 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { - emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); + emitAnalysis(VectorizationReport() << + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return Factor; } - if (!EnableCondStoresVectorization && Legal->NumPredStores) { - emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); + if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { + emitAnalysis(VectorizationReport() << + "store that is conditionally executed prevents vectorization"); DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); return Factor; } @@ -5462,7 +3973,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { if (OptForSize) { // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { - emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); + emitAnalysis + (VectorizationReport() << + "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } @@ -5476,10 +3989,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. if (VF < 2) { - emitAnalysis(Report() << "cannot optimize for size and vectorize at the " - "same time. 
Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); + emitAnalysis(VectorizationReport() << + "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } @@ -5532,6 +4046,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned LoopVectorizationCostModel::getWidestType() { unsigned MaxWidth = 8; + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -5566,7 +4081,7 @@ unsigned LoopVectorizationCostModel::getWidestType() { continue; MaxWidth = std::max(MaxWidth, - (unsigned)DL->getTypeSizeInBits(T->getScalarType())); + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } @@ -5645,7 +4160,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. - unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); + unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the unroll max. if (VF == 1) { @@ -5692,8 +4207,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // Unroll until store/load ports (estimated by max unroll factor) are // saturated. - unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); - unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); + unsigned NumStores = Legal->getNumStores(); + unsigned NumLoads = Legal->getNumLoads(); + unsigned StoresUF = UF / (NumStores ? NumStores : 1); + unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1); // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop @@ -5716,6 +4233,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, return SmallUF; } + // Unroll if this is a large loop (small loops are already dealt with by this + // point) that could benefit from interleaved unrolling. + bool HasReductions = (Legal->getReductionVars()->size() > 0); + if (TTI.enableAggressiveInterleaving(HasReductions)) { + DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n"); + return UF; + } + DEBUG(dbgs() << "LV: Not Unrolling.\n"); return 1; } @@ -6053,8 +4578,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); - unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; + const DataLayout &DL = I->getModule()->getDataLayout(); + unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); + unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); @@ -6081,7 +4607,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Wide load/stores. 
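
One note on the masked-cost change visible in the next hunk: when an access executes under a predicate (for example, a store inside an if-converted block), the target may need a masked vector instruction, so the cost model now asks for TTI.getMaskedMemoryOpCost() whenever Legal->isMaskRequired(I) holds. A hypothetical kernel whose store can take this path:

    void clamp(int *a, int n) {
      for (int i = 0; i < n; ++i)
        if (a[i] > 255)
          a[i] = 255;   // conditional store: can be priced as a masked store
    }
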
unsigned Cost = TTI.getAddressComputationCost(VectorTy); - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, + AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, @@ -6111,14 +4641,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { + bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); - Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); - assert(ID && "Not an intrinsic call!"); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type*, 4> Tys; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) - Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); - return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); + unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); + if (getIntrinsicIDForCall(CI, TLI)) + return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); + return CallCost; } default: { // We are scalarizing the instruction. Return the cost of the scalar @@ -6145,24 +4673,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { }// end of switch. } -Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { - if (Scalar->isVoidTy() || VF == 1) - return Scalar; - return VectorType::get(Scalar, VF); -} - char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6259,7 +4782,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, ConstantInt::get(Cond[Part]->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. 
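
In C terms, one unrolled lane of the predicated store built above behaves like this (illustrative only; the pass constructs the blocks directly):

    if (cond) {     // the freshly split 'cond.store' block
      p[i] = v;     // the scalarized store for this lane
    }               // the 'else' block, where the body resumes

Both split-off blocks must also be registered with the enclosing vector loop; with the updated LoopInfo API that is the addBasicBlockToLoop(..., *LI) call, replacing the old LI->getBase() indirection.
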
Builder.SetInsertPoint(InsertPt); } @@ -6285,7 +4808,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -6310,11 +4833,10 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { // When unrolling and the VF is 1, we only need to add a simple scalar. Type *ITy = Val->getType(); assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx, Negate); - return Builder.CreateAdd(Val, C, "induction"); + Constant *C = ConstantInt::get(ITy, StartIdx); + return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bd8a4b3..504425e 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17,9 +17,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" @@ -75,6 +75,15 @@ static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; +// Limit the number of alias checks. The limit is chosen so that +// it has no negative effect on the llvm benchmarks. +static const unsigned AliasedCheckLimit = 10; + +// Another limit for the alias checks: The maximum distance between load/store +// instructions where alias checks are done. +// This limit is useful for very large basic blocks. +static const unsigned MaxMemDepDistance = 160; + /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM @@ -278,104 +287,6 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) { return true; } -static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - - SmallVector<Value *, 16> OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - Value *V0 = I->getOperand(0); - Value *V1 = I->getOperand(1); - - OrigLeft.push_back(V0); - OrigRight.push_back(V1); - - Instruction *I0 = dyn_cast<Instruction>(V0); - Instruction *I1 = dyn_cast<Instruction>(V1); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. 
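
A concrete case for this comment (hypothetical source): in a bundle built from

    out[0] = a[0] + b[0];
    out[1] = a[1] + b[1];

every left operand is a load and every right operand is a load, so AllSameOpcodeLeft and AllSameOpcodeRight both stay true and the original operand order is kept rather than re-sorted.
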
- AllSameOpcodeLeft = I0; - AllSameOpcodeRight = I1; - - if (i && AllSameOpcodeLeft) { - if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { - if(P0->getOpcode() != I0->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight) { - if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { - if(P1->getOpcode() != I1->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (I0 && I1) { - if(!i && I0->getOpcode() > I1->getOpcode()) { - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else { - Left.push_back(I0); - Right.push_back(I1); - } - continue; - } - // One opcode, put the instruction on the right. - if (I0) { - Left.push_back(V1); - Right.push_back(I0); - continue; - } - Left.push_back(V0); - Right.push_back(V1); - } - - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // Don't reorder if the operands where good to begin with. - if (!(LeftBroadcast || RightBroadcast) && - (AllSameOpcodeRight || AllSameOpcodeLeft)) { - Left = OrigLeft; - Right = OrigRight; - } -} - /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, @@ -412,6 +323,17 @@ static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { return AliasAnalysis::Location(); } +/// \returns True if the instruction is not a volatile or atomic load/store. +static bool isSimple(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return true; +} + /// Bottom Up SLP Vectorizer. 
class BoUpSLP { public: @@ -420,11 +342,11 @@ public: typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; - BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, - TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, - LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) + BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, + TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, + DominatorTree *Dt, AssumptionCache *AC) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), - SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); } @@ -461,7 +383,7 @@ public: } /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); + bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -518,6 +440,16 @@ private: /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + /// \reorder commutative operands in alt shuffle if they result in + /// vectorized code. + void reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); + /// \reorder commutative operands to get better probability of + /// generating vectorized code. + void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {} @@ -594,7 +526,7 @@ private: } AliasAnalysis::Location Loc2 = getLocation(Inst2, AA); bool aliased = true; - if (Loc1.Ptr && Loc2.Ptr) { + if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { // Do the alias check. aliased = AA->alias(Loc1, Loc2); } @@ -945,7 +877,6 @@ private: // Analysis and block reference. Function *F; ScalarEvolution *SE; - const DataLayout *DL; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -1198,8 +1129,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { + const DataLayout &DL = F->getParent()->getDataLayout(); + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1251,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::ICmp: case Instruction::FCmp: { // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast<CmpInst>(VL[i]); @@ -1368,9 +1300,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { return; } case Instruction::Store: { + const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. 
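
To illustrate the swizzle case this comment refers to (a hypothetical bundle): if the candidate stores arrive in source order as

    a[1] = x1;
    a[0] = x0;
    a[3] = x3;
    a[2] = x2;

then isConsecutiveAccess(a[1], a[0], DL) fails (the byte delta between the pointers is -4, not the element size), so the loop below cancels scheduling for the bundle and reports "Non-consecutive store".
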
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1451,6 +1384,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + + // Reorder operands if reordering would enable vectorization. + if (isa<BinaryOperator>(VL0)) { + ValueList Left, Right; + reorderAltShuffleOperands(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1774,7 +1717,7 @@ int BoUpSLP::getTreeCost() { // We only vectorize tiny trees if it is fully vectorizable. if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { - if (!VectorizableTree.size()) { + if (VectorizableTree.empty()) { assert(!ExternalUses.size() && "We should not have any external users"); } return INT_MAX; @@ -1847,7 +1790,7 @@ unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { return -1; } -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); @@ -1861,13 +1804,13 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) return false; - unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); APInt OffsetDelta = OffsetB - OffsetA; @@ -1888,6 +1831,198 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { return X == PtrSCEVB; } +// Reorder commutative operations in alternate shuffle if the resulting vectors +// are consecutive loads. This would allow us to vectorize the tree. +// If we have something like- +// load a[0] - load b[0] +// load b[1] + load a[1] +// load a[2] - load b[2] +// load a[3] + load b[3] +// Reordering the second load b[1] load a[1] would allow us to vectorize this +// code. +void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + const DataLayout &DL = F->getParent()->getDataLayout(); + + // Push left and right operands of binary operation into Left and Right + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); + Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); + } + + // Reorder if we have a commutative operation and consecutive access + // are on either side of the alternate instructions. 
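
A source-level shape for the example in this comment (hypothetical kernel; + is commutative, - is not):

    out[0] = a[0] - b[0];
    out[1] = b[1] + a[1];   // operands flipped in the source
    out[2] = a[2] - b[2];
    out[3] = b[3] + a[3];   // operands flipped in the source

Swapping Left and Right only in the commutative lanes leaves every a[i] load on one side and every b[i] load on the other, so both operand vectors become chains of consecutive loads, which is exactly what the pairwise scan below detects with isConsecutiveAccess().
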
+ for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + } +} + +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + SmallVector<Value *, 16> OrigLeft, OrigRight; + + bool AllSameOpcodeLeft = true; + bool AllSameOpcodeRight = true; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); + + OrigLeft.push_back(VLeft); + OrigRight.push_back(VRight); + + Instruction *ILeft = dyn_cast<Instruction>(VLeft); + Instruction *IRight = dyn_cast<Instruction>(VRight); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + if (i && AllSameOpcodeLeft && ILeft) { + if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { + if (PLeft->getOpcode() != ILeft->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (i && AllSameOpcodeRight && IRight) { + if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { + if (PRight->getOpcode() != IRight->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + // Sort two opcodes. In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. + // vl1 = load + // vl2 = phi + // vr1 = load + // vr2 = vr2 + // = vl1 x vr1 + // = vl2 x vr2 + // If we just sorted according to opcode we would leave the first line in + // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). + // = vl1 x vr1 + // = vr2 x vl2 + // Because vr2 and vr1 are from the same load we loose the opportunity of a + // broadcast for the packed right side in the backend: we have [vr1, vl2] + // instead of [vr1, vr2=vr1]. + if (ILeft && IRight) { + if (!i && ILeft->getOpcode() > IRight->getOpcode()) { + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() > IRight->getOpcode() && + Right[i - 1] != IRight) { + // Try not to destroy a broad cast for no apparent benefit. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Right[i - 1] == ILeft) { + // Try preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Left[i - 1] == IRight) { + // Try preserve broadcasts. 
+ Left.push_back(IRight); + Right.push_back(ILeft); + } else { + Left.push_back(ILeft); + Right.push_back(IRight); + } + continue; + } + // One opcode, put the instruction on the right. + if (ILeft) { + Left.push_back(VRight); + Right.push_back(ILeft); + continue; + } + Left.push_back(VLeft); + Right.push_back(VRight); + } + + bool LeftBroadcast = isSplat(Left); + bool RightBroadcast = isSplat(Right); + + // If operands end up being broadcast return this operand order. + if (LeftBroadcast || RightBroadcast) + return; + + // Don't reorder if the operands where good to begin. + if (AllSameOpcodeRight || AllSameOpcodeLeft) { + Left = OrigLeft; + Right = OrigRight; + } + + const DataLayout &DL = F->getParent()->getDataLayout(); + + // Finally check if we can get longer vectorizable chain by reordering + // without breaking the good operand order detected above. + // E.g. If we have something like- + // load a[0] load b[0] + // load b[1] load a[1] + // load a[2] load b[2] + // load a[3] load b[3] + // Reordering the second load b[1] load a[1] would allow us to vectorize + // this code and we still retain AllSameOpcode property. + // FIXME: This load reordering might break AllSameOpcode in some rare cases + // such as- + // add a[0],c[0] load b[0] + // add a[1],c[2] load b[1] + // b[2] load b[2] + // add a[3],c[3] load b[3] + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + if (isConsecutiveAccess(L, L1, DL)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + if (isConsecutiveAccess(L, L1, DL)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + // else unchanged + } +} + void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { Instruction *VL0 = cast<Instruction>(VL[0]); BasicBlock::iterator NextInst = VL0; @@ -1974,6 +2109,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Gather(E->Scalars, VecTy); } + const DataLayout &DL = F->getParent()->getDataLayout(); unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { @@ -2066,7 +2202,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Value *V = alreadyVectorized(E->Scalars)) return V; - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; if (Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); @@ -2170,8 +2306,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); - if (!Alignment) - Alignment = DL->getABITypeAlignment(ScalarLoadTy); + if (!Alignment) { + Alignment = DL.getABITypeAlignment(ScalarLoadTy); + } LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; @@ -2200,8 +2337,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back( ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); + if (!Alignment) { + Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType()); + } S->setAlignment(Alignment); E->VectorizedValue = S; ++NumVectorInstructions; @@ -2227,7 +2365,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVecs.push_back(OpVec); } - Value *V = Builder.CreateGEP(Op0, OpVecs); + Value *V = Builder.CreateGEP( + 
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); E->VectorizedValue = V; ++NumVectorInstructions; @@ -2243,7 +2382,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; if (CI && (FI = CI->getCalledFunction())) { - IID = (Intrinsic::ID) FI->getIntrinsicID(); + IID = FI->getIntrinsicID(); } std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { @@ -2284,10 +2423,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); - } + assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); @@ -2768,23 +2905,57 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, Instruction *SrcInst = BundleMember->Inst; AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); - if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { - if (SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)) { - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). 
As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } } } @@ -2888,7 +3059,6 @@ struct SLPVectorizer : public FunctionPass { } ScalarEvolution *SE; - const DataLayout *DL; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -2901,12 +3071,11 @@ struct SLPVectorizer : public FunctionPass { return false; SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = &getAnalysis<TargetTransformInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -2918,11 +3087,6 @@ struct SLPVectorizer : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - // Must have DataLayout. We can't require it because some tests run w/o - // triple. - if (!DL) - return false; - // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; @@ -2931,15 +3095,13 @@ struct SLPVectorizer : public FunctionPass { // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. // Scan the blocks in the function in post order. - for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), - e = po_end(&F.getEntryBlock()); it != e; ++it) { - BasicBlock *BB = *it; + for (auto BB : post_order(&F.getEntryBlock())) { // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; @@ -2964,10 +3126,10 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetTransformInfo>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -3014,15 +3176,11 @@ private: /// the WeakVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakVH array. 
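
A short worked timeline for the WeakVH tracking (hypothetical values; in this release WeakVH follows RAUW and nulls out on deletion):

    // Snapshot:  VL = {%s0, %s1}   VH = {%s0, %s1}
    // Vectorizing the first slice RAUWs %s1 -> %s1.vec.
    // Now VH[1] == %s1.vec while VL[1] is still %s1, so the slice
    // comparison in hasValueBeenRAUWed() reports a change and the
    // stale slice is skipped.

The rewrite below expresses that check as an ArrayRef slice plus std::equal instead of a hand-rolled loop:
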
-static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, - SmallVectorImpl<WeakVH> &VH, - unsigned SliceBegin, - unsigned SliceSize) { - for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) - if (VH[i] != VL[i]) - return true; - - return false; +static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH, + unsigned SliceBegin, unsigned SliceSize) { + VL = VL.slice(SliceBegin, SliceSize); + VH = VH.slice(SliceBegin, SliceSize); + return !std::equal(VL.begin(), VL.end(), VH.begin()); } bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, @@ -3031,7 +3189,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); + auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout(); + unsigned Sz = DL.getTypeSizeInBits(StoreTy); unsigned VF = MinVecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) @@ -3074,8 +3233,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, BoUpSLP &R) { - SetVector<Value *> Heads, Tails; - SmallDenseMap<Value *, Value *> ConsecutiveChain; + SetVector<StoreInst *> Heads, Tails; + SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. @@ -3088,8 +3247,8 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, for (unsigned j = 0; j < e; ++j) { if (i == j) continue; - - if (R.isConsecutiveAccess(Stores[i], Stores[j])) { + const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); + if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { Tails.insert(Stores[j]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[j]; @@ -3098,7 +3257,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, } // For stores that start but don't end a link in the chain: - for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); + for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) { if (Tails.count(*it)) continue; @@ -3106,7 +3265,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - Value *I = *it; + StoreInst *I = *it; // Collect the chain into a list. 
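
Concretely, for a hypothetical run of consecutive stores

    a[0] = x0;  a[1] = x1;  a[2] = x2;  a[3] = x3;

the pairwise scan above records Heads = {a[0], a[1], a[2]}, Tails = {a[1], a[2], a[3]}, and ConsecutiveChain maps a[0] -> a[1] -> a[2] -> a[3]. Only a[0] is a head that is not also a tail, so the walk below starts there and collects the whole four-store chain into Operands before handing it to vectorizeStoreChain():
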
while (Tails.count(I) || Heads.count(I)) { if (VectorizedStores.count(I)) @@ -3131,6 +3290,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); + const DataLayout &DL = BB->getModule()->getDataLayout(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { StoreInst *SI = dyn_cast<StoreInst>(it); if (!SI) @@ -3176,9 +3336,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, return false; unsigned Opcode0 = I0->getOpcode(); + const DataLayout &DL = I0->getModule()->getDataLayout(); Type *Ty0 = I0->getType(); - unsigned Sz = DL->getTypeSizeInBits(Ty0); + unsigned Sz = DL.getTypeSizeInBits(Ty0); unsigned VF = MinVecRegSize / Sz; for (int i = 0, e = VL.size(); i < e; ++i) { @@ -3380,8 +3541,7 @@ public: ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} /// \brief Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, - const DataLayout *DL) { + bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "Thi phi needs to use the binary operator"); @@ -3406,9 +3566,10 @@ public: if (!isValidElementType(Ty)) return false; + const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); + ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -3718,8 +3879,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to match and vectorize a horizontal reduction. HorizontalReduction HorRdx; - if (ShouldVectorizeHor && - HorRdx.matchAssociativeReduction(P, BI, DL) && + if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && HorRdx.tryToReduce(R, TTI)) { Changed = true; it = BB->begin(); @@ -3749,7 +3909,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && + if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && HorRdx.tryToReduce(R, TTI)) || tryToVectorize(BinOp, R))) { Changed = true; @@ -3793,6 +3953,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // and the iterator may become invalid value. it = BB->begin(); e = BB->end(); + break; } } } @@ -3849,7 +4010,7 @@ char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp index d459bcf..6e002fd 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -19,7 +19,7 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm; |
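
The include swap above tracks the pass-manager split in this LLVM release: the old llvm/PassManager.h forwarding header is gone and the legacy manager now lives under llvm/IR/LegacyPassManager.h. A hedged usage sketch for embedders driving these vectorizers manually (names are the public API from Transforms/Vectorize.h; the setup around it is illustrative):

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/Vectorize.h"
    using namespace llvm;

    void runSLP(Module &M) {
      legacy::PassManager PM;
      PM.add(createSLPVectorizerPass());  // declared in Transforms/Vectorize.h
      PM.run(M);
    }
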