Diffstat (limited to 'contrib/llvm/lib/Transforms')
29 files changed, 686 insertions, 671 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index f754363..4762011 100644 --- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -825,7 +825,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, V = GetElementPtrInst::Create(SI->first, V, Ops, V->getName() + ".idx", Call); Ops.clear(); - AA.copyValue(OrigLoad->getOperand(0), V); } // Since we're replacing a load make sure we take the alignment // of the previous load. @@ -837,7 +836,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, newLoad->setAAMetadata(AAInfo); Args.push_back(newLoad); - AA.copyValue(OrigLoad, Args.back()); } } diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 76898f2..d044764 100644 --- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -326,7 +326,18 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { /// instead. bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) { - if (Fn.isDeclaration() || Fn.mayBeOverridden()) + // We cannot change the arguments if this TU does not define the function or + // if the linker may choose a function body from another TU, even if the + // nominal linkage indicates that other copies of the function have the same + // semantics. In the example below, the dead load from %p may not have been + // eliminated from the linker-chosen copy of f, so replacing %p with undef + // in callers may introduce undefined behavior. + // + // define linkonce_odr void @f(i32* %p) { + // %v = load i32, i32* %p + // ret void + // } + if (!Fn.isStrongDefinitionForLinker()) return false; // Functions with local linkage should already have been handled, except the @@ -334,19 +345,6 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) return false; - // If a function seen at compile time is not necessarily the one linked to - // the binary being built, it is illegal to change the actual arguments - // passed to it. These functions can be captured by isWeakForLinker(). - // *NOTE* that mayBeOverridden() is insufficient for this purpose as it - // doesn't include linkage types like AvailableExternallyLinkage and - // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of - // *EQUIVALENT* globals that can be merged at link-time. However, the - // semantic of *EQUIVALENT*-functions includes parameters. Changing - // parameters breaks this assumption. - // - if (Fn.isWeakForLinker()) - return false; - if (Fn.use_empty()) return false; diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp new file mode 100644 index 0000000..67ba72d --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -0,0 +1,86 @@ +//===-- ElimAvailExtern.cpp - Eliminate available external globals -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transform is designed to eliminate available external global +// definitions from the program, turning them into declarations.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/CtorUtils.h" +#include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Pass.h" +using namespace llvm; + +#define DEBUG_TYPE "elim-avail-extern" + +STATISTIC(NumFunctions, "Number of functions removed"); +STATISTIC(NumVariables, "Number of global variables removed"); + +namespace { + struct EliminateAvailableExternally : public ModulePass { + static char ID; // Pass identification, replacement for typeid + EliminateAvailableExternally() : ModulePass(ID) { + initializeEliminateAvailableExternallyPass( + *PassRegistry::getPassRegistry()); + } + + // run - Run the EliminateAvailableExternally pass on the specified module, + // turning available externally global definitions into declarations. + // + bool runOnModule(Module &M) override; + }; +} + +char EliminateAvailableExternally::ID = 0; +INITIALIZE_PASS(EliminateAvailableExternally, "elim-avail-extern", + "Eliminate Available Externally Globals", false, false) + +ModulePass *llvm::createEliminateAvailableExternallyPass() { + return new EliminateAvailableExternally(); +} + +bool EliminateAvailableExternally::runOnModule(Module &M) { + bool Changed = false; + + // Drop initializers of available externally global variables. + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (!I->hasAvailableExternallyLinkage()) + continue; + if (I->hasInitializer()) { + Constant *Init = I->getInitializer(); + I->setInitializer(nullptr); + if (isSafeToDestroyConstant(Init)) + Init->destroyConstant(); + } + I->removeDeadConstantUsers(); + I->setLinkage(GlobalValue::ExternalLinkage); + NumVariables++; + Changed = true; + } + + // Drop the bodies of available externally functions. + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + if (!I->hasAvailableExternallyLinkage()) + continue; + if (!I->isDeclaration()) + // This will set the linkage to external + I->deleteBody(); + I->removeDeadConstantUsers(); + NumFunctions++; + Changed = true; + } + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp index 2f8c7d9..b9462f2 100644 --- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -93,8 +93,11 @@ namespace { makeVisible(*I, Delete); - if (Delete) + if (Delete) { + // Make this a declaration and drop its comdat. I->setInitializer(nullptr); + I->setComdat(nullptr); + } } // Visit the Functions. @@ -108,8 +111,11 @@ namespace { makeVisible(*I, Delete); - if (Delete) + if (Delete) { + // Make this a declaration and drop its comdat. I->deleteBody(); + I->setComdat(nullptr); + } } // Visit the Aliases.
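The DeadArgumentElimination hunk above replaces the old isWeakForLinker() test with a single Fn.isStrongDefinitionForLinker() query, and the Local.cpp hunk later in this diff makes the same substitution. A minimal standalone sketch of what that predicate computes, using a simplified linkage enum rather than the real llvm::GlobalValue API (all names below are illustrative, not LLVM's):

    #include <cassert>

    // Simplified model of the linkage kinds involved (subset of llvm::GlobalValue).
    enum class Linkage {
      External, AvailableExternally, LinkOnceAny, LinkOnceODR,
      WeakAny, WeakODR, Common, ExternalWeak, Internal
    };

    // True if the linker may pick an equivalent body from another TU.
    bool isWeakForLinker(Linkage L) {
      return L == Linkage::LinkOnceAny || L == Linkage::LinkOnceODR ||
             L == Linkage::WeakAny || L == Linkage::WeakODR ||
             L == Linkage::Common || L == Linkage::ExternalWeak;
    }

    // True if the linker will not use the body in this TU: either there is no
    // body at all, or it is only an available_externally hint for the optimizer.
    bool isDeclarationForLinker(Linkage L, bool HasBody) {
      return !HasBody || L == Linkage::AvailableExternally;
    }

    // The definition the linker keeps is exactly the one in this module, so a
    // pass may legally change its interface (e.g. drop dead arguments).
    bool isStrongDefinitionForLinker(Linkage L, bool HasBody) {
      return !(isDeclarationForLinker(L, HasBody) || isWeakForLinker(L));
    }

    int main() {
      // linkonce_odr: equivalent copies may exist in other TUs, and "equivalent"
      // includes the parameter list, so rewriting arguments here is unsafe.
      assert(!isStrongDefinitionForLinker(Linkage::LinkOnceODR, /*HasBody=*/true));
      // available_externally is a declaration as far as the linker is concerned.
      assert(!isStrongDefinitionForLinker(Linkage::AvailableExternally, true));
      // A plain external definition in this TU is the one the linker will use.
      assert(isStrongDefinitionForLinker(Linkage::External, /*HasBody=*/true));
      return 0;
    }

The point of the combined predicate is that both weak-for-linker and available_externally definitions can be replaced at link time, so interface-changing transforms must treat them as read-only.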
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index fcacec328..50f56b0 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -46,6 +46,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeStripDeadDebugInfoPass(Registry); initializeStripNonDebugSymbolsPass(Registry); initializeBarrierNoopPass(Registry); + initializeEliminateAvailableExternallyPass(Registry); } void LLVMInitializeIPO(LLVMPassRegistryRef R) { diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 963f1bb..88e5e47 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -105,6 +105,7 @@ PassManagerBuilder::PassManagerBuilder() { VerifyInput = false; VerifyOutput = false; MergeFunctions = false; + PrepareForLTO = false; } PassManagerBuilder::~PassManagerBuilder() { @@ -319,8 +320,8 @@ void PassManagerBuilder::populateModulePassManager( // Re-rotate loops in all our loop nests. These may have fallen out of // rotated form due to GVN or other transformations, and the vectorizer relies - // on the rotated form. - MPM.add(createLoopRotatePass()); + // on the rotated form. Disable header duplication at -Oz. + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into a separate loop that would otherwise inhibit vectorization. @@ -401,6 +402,17 @@ void PassManagerBuilder::populateModulePassManager( // GlobalOpt already deletes dead functions and globals, at -O2 try a // late pass of GlobalDCE. It is capable of deleting dead cycles. if (OptLevel > 1) { + if (!PrepareForLTO) { + // Remove available external function and global definitions if we + // aren't compiling an object file for later LTO. For LTO we want to + // preserve these so they are eligible for inlining at link-time. Note + // that if they are unreferenced they will be removed by GlobalDCE + // below, so this only impacts referenced available externally globals. + // Eventually they will be suppressed during codegen, but eliminating + // them here enables more opportunity for GlobalDCE as it may make + // globals referenced by available external functions dead. + MPM.add(createEliminateAvailableExternallyPass()); + } MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
MPM.add(createConstantMergePass()); // Merge dup global constants } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 010b7b5..0bd6fd2 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3928,8 +3928,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Value *V = - SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC, &I)) + if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, + I.getFastMathFlags(), DL, TLI, DT, AC, &I)) return ReplaceInstUsesWith(I, V); // Simplify 'fcmp pred X, X' diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 97ea8df..ac934f1 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H #define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetFolder.h" @@ -177,6 +178,8 @@ private: // Mode in which we are running the combiner. const bool MinimizeSize; + AliasAnalysis *AA; + // Required analyses. // FIXME: These can never be null and should be references. AssumptionCache *AC; @@ -192,10 +195,11 @@ private: public: InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder, - bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI, + bool MinimizeSize, AliasAnalysis *AA, + AssumptionCache *AC, TargetLibraryInfo *TLI, DominatorTree *DT, const DataLayout &DL, LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {} + AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {} /// \brief Run the combiner over the entire worklist until it is empty. /// diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e7a4533..e3179db 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -749,10 +749,25 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. BasicBlock::iterator BBI = &LI; - if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6)) + AAMDNodes AATags; + if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI, + 6, AA, &AATags)) { + if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) { + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, + }; + combineMetadata(NLI, &LI, KnownIDs); + } + return ReplaceInstUsesWith( LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(), LI.getName() + ".cast")); + } // load(gep null, ...)
-> unreachable if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 24446c8..2730472 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -14,6 +14,8 @@ #include "InstCombineInternal.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/PatternMatch.h" using namespace llvm; using namespace PatternMatch; @@ -60,56 +62,6 @@ static bool CheapToScalarize(Value *V, bool isConstant) { return false; } -/// FindScalarElement - Given a vector and an element number, see if the scalar -/// value is already around as a register, for example if it were inserted then -/// extracted from the vector. -static Value *FindScalarElement(Value *V, unsigned EltNo) { - assert(V->getType()->isVectorTy() && "Not looking at a vector?"); - VectorType *VTy = cast<VectorType>(V->getType()); - unsigned Width = VTy->getNumElements(); - if (EltNo >= Width) // Out of range access. - return UndefValue::get(VTy->getElementType()); - - if (Constant *C = dyn_cast<Constant>(V)) - return C->getAggregateElement(EltNo); - - if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) { - // If this is an insert to a variable element, we don't know what it is. - if (!isa<ConstantInt>(III->getOperand(2))) - return nullptr; - unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue(); - - // If this is an insert to the element we are looking for, return the - // inserted value. - if (EltNo == IIElt) - return III->getOperand(1); - - // Otherwise, the insertelement doesn't modify the value, recurse on its - // vector input. - return FindScalarElement(III->getOperand(0), EltNo); - } - - if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) { - unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements(); - int InEl = SVI->getMaskValue(EltNo); - if (InEl < 0) - return UndefValue::get(VTy->getElementType()); - if (InEl < (int)LHSWidth) - return FindScalarElement(SVI->getOperand(0), InEl); - return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth); - } - - // Extract a value from a vector add operation with a constant zero. - Value *Val = nullptr; Constant *Con = nullptr; - if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) { - if (Con->getAggregateElement(EltNo)->isNullValue()) - return FindScalarElement(Val, EltNo); - } - - // Otherwise, we don't know. - return nullptr; -} - // If we have a PHI node with a vector type that has only 2 uses: feed // itself and be an operand of extractelement at a constant location, // try to replace the PHI of the vector type with a PHI of a scalar type. @@ -178,6 +130,10 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { } Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { + if (Value *V = SimplifyExtractElementInst( + EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC)) + return ReplaceInstUsesWith(EI, V); + // If vector val is constant with all elements the same, replace EI with // that element. We handle a known element # below. 
if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) @@ -190,10 +146,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { unsigned IndexVal = IdxC->getZExtValue(); unsigned VectorWidth = EI.getVectorOperandType()->getNumElements(); - // If this is extracting an invalid index, turn this into undef, to avoid - // crashing the code below. - if (IndexVal >= VectorWidth) - return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType())); + // InstSimplify handles cases where the index is invalid. + assert(IndexVal < VectorWidth); // This instruction only demands the single element from the input vector. // If the input vector has a single use, simplify it based on this use @@ -209,16 +163,13 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } - if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal)) - return ReplaceInstUsesWith(EI, Elt); - // If this extractelement is directly using a bitcast from a vector of // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType())) if (VT->getNumElements() == VectorWidth) - if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal)) + if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal)) return new BitCastInst(Elt, EI.getType()); } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2a81689..fd34a24 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2174,16 +2174,9 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { if (!EV.hasIndices()) return ReplaceInstUsesWith(EV, Agg); - if (Constant *C = dyn_cast<Constant>(Agg)) { - if (Constant *C2 = C->getAggregateElement(*EV.idx_begin())) { - if (EV.getNumIndices() == 0) - return ReplaceInstUsesWith(EV, C2); - // Extract the remaining indices out of the constant indexed by the - // first index - return ExtractValueInst::Create(C2, EV.getIndices().slice(1)); - } - return nullptr; // Can't handle other constants - } + if (Value *V = + SimplifyExtractValueInst(Agg, EV.getIndices(), DL, TLI, DT, AC)) + return ReplaceInstUsesWith(EV, V); if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { // We're extracting from an insertvalue instruction, compare the indices @@ -2972,8 +2965,9 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, static bool combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, - AssumptionCache &AC, TargetLibraryInfo &TLI, - DominatorTree &DT, LoopInfo *LI = nullptr) { + AliasAnalysis *AA, AssumptionCache &AC, + TargetLibraryInfo &TLI, DominatorTree &DT, + LoopInfo *LI = nullptr) { // Minimizing size?
bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); auto &DL = F.getParent()->getDataLayout(); @@ -2998,7 +2992,8 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) Changed = true; - InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI); + InstCombiner IC(Worklist, &Builder, MinimizeSize, + AA, &AC, &TLI, &DT, DL, LI); if (IC.run()) Changed = true; @@ -3017,7 +3012,8 @@ PreservedAnalyses InstCombinePass::run(Function &F, auto *LI = AM->getCachedResult<LoopAnalysis>(F); - if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI)) + // FIXME: The AliasAnalysis is not yet supported in the new pass manager + if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3050,6 +3046,7 @@ public: void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + AU.addRequired<AliasAnalysis>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); @@ -3061,6 +3058,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { return false; // Required analyses. + auto AA = &getAnalysis<AliasAnalysis>(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -3069,7 +3067,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, LI); } char InstructionCombiningPass::ID = 0; @@ -3078,6 +3076,7 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 60903c8..d1eba6e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -656,11 +656,14 @@ namespace { LeaderTableEntry* Prev = nullptr; LeaderTableEntry* Curr = &LeaderTable[N]; - while (Curr->Val != I || Curr->BB != BB) { + while (Curr && (Curr->Val != I || Curr->BB != BB)) { Prev = Curr; Curr = Curr->Next; } + if (!Curr) + return; + if (Prev) { Prev->Next = Curr->Next; } else { @@ -1304,11 +1307,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (V->getType()->getScalarType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) - AA->copyValue(LI, NewPHIs[i]); - - // Now that we've copied information to the new PHIs, scan through - // them again and inform alias analysis that we've added potentially + // Scan the new PHIs and inform alias analysis that we've added potentially // escaping uses to any values that are operands to these PHIs. 
for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { PHINode *P = NewPHIs[i]; @@ -1796,7 +1795,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { // In general, GVN unifies expressions over different control-flow // regions, and so we need a conservative combination of the noalias // scopes. - unsigned KnownIDs[] = { + static const unsigned KnownIDs[] = { LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, LLVMContext::MD_noalias, diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 6f03754..2a954d9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -135,6 +136,10 @@ namespace { PHINode *IndVar, SCEVExpander &Rewriter); void SinkUnusedInvariants(Loop *L); + + Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, + Instruction *InsertPt, Type *Ty, + bool &IsHighCostExpansion); }; } @@ -496,6 +501,52 @@ struct RewritePhi { }; } +Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, + Loop *L, Instruction *InsertPt, + Type *ResultTy, + bool &IsHighCostExpansion) { + using namespace llvm::PatternMatch; + + if (!Rewriter.isHighCostExpansion(S, L)) { + IsHighCostExpansion = false; + return Rewriter.expandCodeFor(S, ResultTy, InsertPt); + } + + // Before expanding S into an expensive LLVM expression, see if we can use an + // already existing value as the expansion for S. There is potential to make + // this significantly smarter, but this simple heuristic already gets some + // interesting cases. + + SmallVector<BasicBlock *, 4> Latches; + L->getLoopLatches(Latches); + + for (BasicBlock *BB : Latches) { + ICmpInst::Predicate Pred; + Instruction *LHS, *RHS; + BasicBlock *TrueBB, *FalseBB; + + if (!match(BB->getTerminator(), + m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), + TrueBB, FalseBB))) + continue; + + if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) { + IsHighCostExpansion = false; + return LHS; + } + + if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) { + IsHighCostExpansion = false; + return RHS; + } + } + + // We didn't find anything, fall back to using SCEVExpander. + assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!"); + IsHighCostExpansion = true; + return Rewriter.expandCodeFor(S, ResultTy, InsertPt); +} + //===----------------------------------------------------------------------===// // RewriteLoopExitValues - Optimize IV users outside the loop. // As a side effect, reduces the amount of IV processing within the loop. 
@@ -628,7 +679,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { continue; } - Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst); + bool HighCost = false; + Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, + PN->getType(), HighCost); DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' << " LoopVal = " << *Inst << "\n"); @@ -637,7 +690,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { DeadInsts.push_back(ExitVal); continue; } - bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L); // Collect all the candidate PHINodes to be rewritten. RewritePhiSet.push_back( diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index f0e6d64..43fc50e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -602,7 +602,8 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of // the instruction. while (!I.use_empty()) { - Instruction *User = I.user_back(); + Value::user_iterator UI = I.user_begin(); + auto *User = cast<Instruction>(*UI); if (!DT->isReachableFromEntry(User->getParent())) { User->replaceUsesOfWith(&I, UndefValue::get(I.getType())); continue; @@ -610,6 +611,16 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, // The user must be a PHI node. PHINode *PN = cast<PHINode>(User); + // Surprisingly, instructions can be used outside of loops without any + // exits. This can only happen in PHI nodes if the incoming block is + // unreachable. + Use &U = UI.getUse(); + BasicBlock *BB = PN->getIncomingBlock(U); + if (!DT->isReachableFromEntry(BB)) { + U = UndefValue::get(I.getType()); + continue; + } + BasicBlock *ExitBlock = PN->getParent(); assert(ExitBlockSet.count(ExitBlock) && "The LCSSA PHI is not in an exit block!"); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0325d26..1b9859b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #include <list> #define LDIST_NAME "loop-distribute" @@ -55,70 +56,6 @@ static cl::opt<bool> DistributeNonIfConvertible( STATISTIC(NumLoopsDistributed, "Number of loops distributed"); -/// \brief Remaps instructions in a loop including the preheader. -static void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks, - ValueToValueMapTy &VMap) { - // Rewrite the code to refer to itself. - for (auto *BB : Blocks) - for (auto &Inst : *BB) - RemapInstruction(&Inst, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); -} - -/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p -/// Blocks. -/// -/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block -/// \p LoopDomBB. Insert the new blocks before block specified in \p Before. 
-static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, - Loop *OrigLoop, ValueToValueMapTy &VMap, - const Twine &NameSuffix, LoopInfo *LI, - DominatorTree *DT, - SmallVectorImpl<BasicBlock *> &Blocks) { - Function *F = OrigLoop->getHeader()->getParent(); - Loop *ParentLoop = OrigLoop->getParentLoop(); - - Loop *NewLoop = new Loop(); - if (ParentLoop) - ParentLoop->addChildLoop(NewLoop); - else - LI->addTopLevelLoop(NewLoop); - - BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); - BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); - // To rename the loop PHIs. - VMap[OrigPH] = NewPH; - Blocks.push_back(NewPH); - - // Update LoopInfo. - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewPH, *LI); - - // Update DominatorTree. - DT->addNewBlock(NewPH, LoopDomBB); - - for (BasicBlock *BB : OrigLoop->getBlocks()) { - BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); - VMap[BB] = NewBB; - - // Update LoopInfo. - NewLoop->addBasicBlockToLoop(NewBB, *LI); - - // Update DominatorTree. - BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); - DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB])); - - Blocks.push_back(NewBB); - } - - // Move them physically from the end of the block list. - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), - NewLoop->getHeader(), F->end()); - - return NewLoop; -} - namespace { /// \brief Maintains the set of instructions of the loop for a partition before /// cloning. After cloning, it hosts the new loop. @@ -204,7 +141,9 @@ public: ValueToValueMapTy &getVMap() { return VMap; } /// \brief Remaps the cloned instructions using VMap. - void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); } + void remapInstructions() { + remapInstructionsInBlocks(ClonedLoopBlocks, VMap); + } /// \brief Based on the set of instructions selected for this partition, /// removes the unnecessary ones. @@ -493,15 +432,14 @@ public: /// partitions its entry is set to -1. SmallVector<int, 8> computePartitionSetForPointers(const LoopAccessInfo &LAI) { - const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck = - LAI.getRuntimePointerCheck(); + const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); unsigned N = RtPtrCheck->Pointers.size(); SmallVector<int, 8> PtrToPartitions(N); for (unsigned I = 0; I < N; ++I) { - Value *Ptr = RtPtrCheck->Pointers[I]; + Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; auto Instructions = - LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]); + LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); int &Partition = PtrToPartitions[I]; // First set it to uninitialized. @@ -629,121 +567,6 @@ private: AccessesType Accesses; }; -/// \brief Handles the loop versioning based on memchecks. -class LoopVersioning { -public: - LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, - DominatorTree *DT, - const SmallVector<int, 8> *PtrToPartition = nullptr) - : VersionedLoop(L), NonVersionedLoop(nullptr), - PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) {} - - /// \brief Returns true if we need memchecks to disambiguate may-aliasing - /// accesses. - bool needsRuntimeChecks() const { - return LAI.getRuntimePointerCheck()->needsAnyChecking(PtrToPartition); - } - - /// \brief Performs the CFG manipulation part of versioning the loop including - /// the DominatorTree and LoopInfo updates. 
- void versionLoop(Pass *P) { - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - // Add the memcheck in the original preheader (this is empty initially). - BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader(); - std::tie(FirstCheckInst, MemRuntimeCheck) = - LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition); - assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); - - // Rename the block to make the IR more readable. - MemCheckBB->setName(VersionedLoop->getHeader()->getName() + - ".lver.memcheck"); - - // Create empty preheader for the loop (and after cloning for the - // non-versioned loop). - BasicBlock *PH = - SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); - PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); - - // Clone the loop including the preheader. - // - // FIXME: This does not currently preserve SimplifyLoop because the exit - // block is a join between the two loops. - SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks; - NonVersionedLoop = - cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, - ".lver.orig", LI, DT, NonVersionedLoopBlocks); - remapInstructionsInLoop(NonVersionedLoopBlocks, VMap); - - // Insert the conditional branch based on the result of the memchecks. - Instruction *OrigTerm = MemCheckBB->getTerminator(); - BranchInst::Create(NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader(), MemRuntimeCheck, - OrigTerm); - OrigTerm->eraseFromParent(); - - // The loops merge in the original exit block. This is now dominated by the - // memchecking block. - DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB); - } - - /// \brief Adds the necessary PHI nodes for the versioned loops based on the - /// loop-defined values used outside of the loop. - void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) { - BasicBlock *PHIBlock = VersionedLoop->getExitBlock(); - assert(PHIBlock && "No single successor to loop exit block"); - - for (auto *Inst : DefsUsedOutside) { - auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]); - PHINode *PN; - - // First see if we have a single-operand PHI with the value defined by the - // original loop. - for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) { - assert(PN->getNumOperands() == 1 && - "Exit block should only have on predecessor"); - if (PN->getIncomingValue(0) == Inst) - break; - } - // If not create it. - if (!PN) { - PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver", - PHIBlock->begin()); - for (auto *User : Inst->users()) - if (!VersionedLoop->contains(cast<Instruction>(User)->getParent())) - User->replaceUsesOfWith(Inst, PN); - PN->addIncoming(Inst, VersionedLoop->getExitingBlock()); - } - // Add the new incoming value from the non-versioned loop. - PN->addIncoming(NonVersionedLoopInst, - NonVersionedLoop->getExitingBlock()); - } - } - -private: - /// \brief The original loop. This becomes the "versioned" one, i.e. control - /// goes if the memchecks all pass. - Loop *VersionedLoop; - /// \brief The fall-back loop, i.e. if any of the memchecks fail. - Loop *NonVersionedLoop; - - /// \brief For each memory pointer it contains the partitionId it is used in. - /// If nullptr, no partitioning is used. - /// - /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck(). - /// If the pointer is used in multiple partitions the entry is set to -1. 
- const SmallVector<int, 8> *PtrToPartition; - - /// \brief This maps the instructions from VersionedLoop to their counterpart - /// in NonVersionedLoop. - ValueToValueMapTy VMap; - - /// \brief Analyses used. - const LoopAccessInfo &LAI; - LoopInfo *LI; - DominatorTree *DT; -}; - /// \brief Returns the instructions that use values defined in the loop. static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { SmallVector<Instruction *, 8> UsedOutside; @@ -929,7 +752,7 @@ private: LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition); if (LVer.needsRuntimeChecks()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition)); + DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition)); LVer.versionLoop(this); LVer.addPHINodes(DefsUsedOutside); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 714ce91..a21ca24 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -508,7 +508,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, ICmpInst *NewPreCond = cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); - PreCond->replaceAllUsesWith(NewPreCond); + PreCondBr->setCondition(NewPreCond); RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 2554655..9d7e57f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -282,21 +282,21 @@ static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { DEBUG(dbgs() << "Calling populateWorklist called\n"); LoopVector LoopList; Loop *CurrentLoop = &L; - std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector(); - while (vec.size() != 0) { + const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); + while (!Vec->empty()) { // The current loop has multiple subloops in it hence it is not tightly // nested. // Discard all loops above it added into Worklist. - if (vec.size() != 1) { + if (Vec->size() != 1) { LoopList.clear(); return; } LoopList.push_back(CurrentLoop); - CurrentLoop = *(vec.begin()); - vec = CurrentLoop->getSubLoopsVector(); + CurrentLoop = Vec->front(); + Vec = &CurrentLoop->getSubLoops(); } LoopList.push_back(CurrentLoop); - V.push_back(LoopList); + V.push_back(std::move(LoopList)); } static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 9e7558d..d78db6c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -840,8 +840,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; - bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; - if (HasRuntimeUnrollDisablePragma(L)) { + bool AllowRuntime = + (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime); + // Don't unroll a runtime trip count loop with unroll full pragma. 
+ if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { AllowRuntime = false; } if (Unrolling == Partial) { diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 243db8d..643f374 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -301,10 +301,6 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, // Merged instruction Instruction *HoistedInst = HoistCand->clone(); - // Notify AA of the new value. - if (isa<LoadInst>(HoistCand)) - AA->copyValue(HoistCand, HoistedInst); - // Hoist instruction. HoistedInst->insertBefore(HoistPt); @@ -451,9 +447,6 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); if (NewPN->getType()->getScalarType()->isPointerTy()) { - // Notify AA of the new value. - AA->copyValue(Opd1, NewPN); - AA->copyValue(Opd2, NewPN); // AA needs to be informed when a PHI-use of the pointer value is added for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { unsigned J = PHINode::getOperandNumForIncomingValue(I); @@ -491,7 +484,6 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, // Create the new store to be inserted at the join point. StoreInst *SNew = (StoreInst *)(S0->clone()); Instruction *ANew = A0->clone(); - AA->copyValue(S0, SNew); SNew->insertBefore(InsertPt); ANew->insertBefore(SNew); diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 9ecaf10..366301a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -399,8 +399,8 @@ static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) { // at least if they do, are leaf functions that cause only finite stack // growth. In particular, the optimizer likes to form things like memsets // out of stores in the original IR. Another important example is - // llvm.frameescape which must occur in the entry block. Inserting a - // safepoint before it is not legal since it could push the frameescape + // llvm.localescape which must occur in the entry block. Inserting a + // safepoint before it is not legal since it could push the localescape // out of the entry block. 
return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 305175f..4d3a708 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -1799,11 +1799,10 @@ bool IPSCCP::runOnModule(Module &M) { if (!TI->use_empty()) TI->replaceAllUsesWith(UndefValue::get(TI->getType())); TI->eraseFromParent(); + new UnreachableInst(M.getContext(), BB); if (&*BB != &F->front()) BlocksToErase.push_back(BB); - else - new UnreachableInst(M.getContext(), BB); continue; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 056dd11..d1a0a82 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2593,13 +2593,21 @@ private: V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && canConvertValue(DL, NewAllocaTy, LI.getType())) { - V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), - LI.getName()); + LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + LI.isVolatile(), LI.getName()); + if (LI.isVolatile()) + NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + + V = NewLI; } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), - getSliceAlign(TargetTy), LI.isVolatile(), - LI.getName()); + LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), + getSliceAlign(TargetTy), + LI.isVolatile(), LI.getName()); + if (LI.isVolatile()) + NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + + V = NewLI; IsPtrAdjusted = true; } V = convertValue(DL, IRB, V, TargetTy); @@ -2722,7 +2730,8 @@ private: NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), SI.isVolatile()); } - (void)NewSI; + if (SI.isVolatile()) + NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope()); Pass.DeadInsts.insert(&SI); deleteIfTriviallyDead(OldOp); diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 53471de..ef7daca 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -440,8 +440,6 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, // Create the new PHI node, insert it into NewBB at the end of the block PHINode *NewPHI = PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI); - if (AA) - AA->copyValue(PN, NewPHI); // NOTE! This loop walks backwards for a reason! First off, this minimizes // the cost of removal if we end up removing a large number of values, and diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index 4f8d1df..cc4d6c6 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -720,3 +721,68 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ModuleLevelChanges, Returns, NameSuffix, CodeInfo, nullptr); } + +/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap. 
+void llvm::remapInstructionsInBlocks( + const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) { + // Rewrite the code to refer to itself. + for (auto *BB : Blocks) + for (auto &Inst : *BB) + RemapInstruction(&Inst, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); +} + +/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p +/// Blocks. +/// +/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block +/// \p LoopDomBB. Insert the new blocks before block specified in \p Before. +Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, + Loop *OrigLoop, ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl<BasicBlock *> &Blocks) { + Function *F = OrigLoop->getHeader()->getParent(); + Loop *ParentLoop = OrigLoop->getParentLoop(); + + Loop *NewLoop = new Loop(); + if (ParentLoop) + ParentLoop->addChildLoop(NewLoop); + else + LI->addTopLevelLoop(NewLoop); + + BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); + assert(OrigPH && "No preheader"); + BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); + // To rename the loop PHIs. + VMap[OrigPH] = NewPH; + Blocks.push_back(NewPH); + + // Update LoopInfo. + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewPH, *LI); + + // Update DominatorTree. + DT->addNewBlock(NewPH, LoopDomBB); + + for (BasicBlock *BB : OrigLoop->getBlocks()) { + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + VMap[BB] = NewBB; + + // Update LoopInfo. + NewLoop->addBasicBlockToLoop(NewBB, *LI); + + // Update DominatorTree. + BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); + DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB])); + + Blocks.push_back(NewBB); + } + + // Move them physically from the end of the block list. + F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); + F->getBasicBlockList().splice(Before, F->getBasicBlockList(), + NewLoop->getHeader(), F->end()); + + return NewLoop; +} diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index 5608557..50ca623 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -900,13 +900,10 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, if (auto *GO = dyn_cast<GlobalObject>(V)) { // If there is a large requested alignment and we can, bump up the alignment - // of the global. - if (GO->isDeclaration()) - return Align; - // If the memory we set aside for the global may not be the memory used by - // the final program then it is impossible for us to reliably enforce the - // preferred alignment. - if (GO->isWeakForLinker()) + // of the global. If the memory we set aside for the global may not be the + // memory used by the final program then it is impossible for us to reliably + // enforce the preferred alignment. 
+ if (!GO->isStrongDefinitionForLinker()) return Align; if (GO->getAlignment() >= PrefAlign) diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 2e7d21c..5c98043 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -403,7 +403,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, PHINode *PN = cast<PHINode>(I); PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(), PN->getName()+".be", BETerminator); - if (AA) AA->copyValue(PN, NewPN); // Loop over the PHI node, moving all entries except the one for the // preheader over to the new PHI node. diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp new file mode 100644 index 0000000..832079d --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -0,0 +1,106 @@ +//===- LoopVersioning.cpp - Utility to version a loop ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a utility class to perform loop versioning. The versioned +// loop speculates that otherwise may-aliasing memory accesses don't overlap and +// emits checks to prove this. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" + +using namespace llvm; + +LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, + DominatorTree *DT, + const SmallVector<int, 8> *PtrToPartition) + : VersionedLoop(L), NonVersionedLoop(nullptr), + PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) { + assert(L->getExitBlock() && "No single exit block"); + assert(L->getLoopPreheader() && "No preheader"); +} + +bool LoopVersioning::needsRuntimeChecks() const { + return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition); +} + +void LoopVersioning::versionLoop(Pass *P) { + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + // Add the memcheck in the original preheader (this is empty initially). + BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader(); + std::tie(FirstCheckInst, MemRuntimeCheck) = + LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition); + assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); + + // Rename the block to make the IR more readable. + MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck"); + + // Create empty preheader for the loop (and after cloning for the + // non-versioned loop). + BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); + PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); + + // Clone the loop including the preheader. + // + // FIXME: This does not currently preserve SimplifyLoop because the exit + // block is a join between the two loops. 
+ SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks; + NonVersionedLoop = + cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig", + LI, DT, NonVersionedLoopBlocks); + remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap); + + // Insert the conditional branch based on the result of the memchecks. + Instruction *OrigTerm = MemCheckBB->getTerminator(); + BranchInst::Create(NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader(), MemRuntimeCheck, + OrigTerm); + OrigTerm->eraseFromParent(); + + // The loops merge in the original exit block. This is now dominated by the + // memchecking block. + DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB); +} + +void LoopVersioning::addPHINodes( + const SmallVectorImpl<Instruction *> &DefsUsedOutside) { + BasicBlock *PHIBlock = VersionedLoop->getExitBlock(); + assert(PHIBlock && "No single successor to loop exit block"); + + for (auto *Inst : DefsUsedOutside) { + auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]); + PHINode *PN; + + // First see if we have a single-operand PHI with the value defined by the + // original loop. + for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) { + assert(PN->getNumOperands() == 1 && + "Exit block should only have one predecessor"); + if (PN->getIncomingValue(0) == Inst) + break; + } + // If not, create it. + if (!PN) { + PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver", + PHIBlock->begin()); + for (auto *User : Inst->users()) + if (!VersionedLoop->contains(cast<Instruction>(User)->getParent())) + User->replaceUsesOfWith(Inst, PN); + PN->addIncoming(Inst, VersionedLoop->getExitingBlock()); + } + // Add the new incoming value from the non-versioned loop. + PN->addIncoming(NonVersionedLoopInst, NonVersionedLoop->getExitingBlock()); + } +} diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5ba1417..69ca268 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -148,8 +148,9 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor( cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8)); -/// We don't unroll loops with a known constant trip count below this number. -static const unsigned TinyTripCountUnrollThreshold = 128; +/// We don't interleave loops with a known constant trip count below this +/// number. +static const unsigned TinyTripCountInterleaveThreshold = 128; static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -180,7 +181,8 @@ static cl::opt<unsigned> ForceTargetInstructionCost( static cl::opt<unsigned> SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, - cl::desc("The cost of a loop that is considered 'small' by the unroller.")); + cl::desc( + "The cost of a loop that is considered 'small' by the interleaver.")); static cl::opt<bool> LoopVectorizeWithBlockFrequency( "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, @@ -188,10 +190,11 @@ static cl::opt<bool> LoopVectorizeWithBlockFrequency( "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions.")); -// Runtime interleave loops for load/store throughput.
-static cl::opt<bool> EnableLoadStoreRuntimeUnroll( - "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, - cl::desc("Enable runtime unrolling until load/store ports are saturated")); +// Runtime interleave loops for load/store throughput. +static cl::opt<bool> EnableLoadStoreRuntimeInterleave( + "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, + cl::desc( + "Enable runtime interleaving until load/store ports are saturated")); /// The number of stores in a loop that are allowed to need predication. static cl::opt<unsigned> NumberOfStoresToPredicate( @@ -200,15 +203,15 @@ static cl::opt<unsigned> NumberOfStoresToPredicate( static cl::opt<bool> EnableIndVarRegisterHeur( "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, - cl::desc("Count the induction variable only once when unrolling")); + cl::desc("Count the induction variable only once when interleaving")); static cl::opt<bool> EnableCondStoresVectorization( "enable-cond-stores-vec", cl::init(false), cl::Hidden, cl::desc("Enable if predication of stores during vectorization.")); -static cl::opt<unsigned> MaxNestedScalarReductionUF( - "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, - cl::desc("The maximum unroll factor to use when unrolling a scalar " +static cl::opt<unsigned> MaxNestedScalarReductionIC( + "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, + cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); namespace { @@ -921,8 +924,8 @@ public: bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. - const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const { - return LAI->getRuntimePointerCheck(); + const RuntimePointerChecking *getRuntimePointerChecking() const { + return LAI->getRuntimePointerChecking(); } const LoopAccessInfo *getLAI() const { @@ -1105,12 +1108,19 @@ public: /// 64 bit loop indices. unsigned getWidestType(); + /// \return The desired interleave count. + /// If interleave count has been specified by metadata it will be returned. + /// Otherwise, the interleave count is computed and returned. VF and LoopCost + /// are the selected vectorization factor and the cost of the selected VF. + unsigned selectInterleaveCount(bool OptForSize, unsigned VF, + unsigned LoopCost); + /// \return The most profitable unroll factor. - /// If UserUF is non-zero then this method finds the best unroll-factor - /// based on register pressure and other parameters. - /// VF and LoopCost are the selected vectorization factor and the cost of the - /// selected VF. - unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); + /// This method finds the best unroll-factor based on register pressure and + /// other parameters. VF and LoopCost are the selected vectorization factor + /// and the cost of the selected VF. + unsigned computeInterleaveCount(bool OptForSize, unsigned VF, + unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. @@ -1456,9 +1466,14 @@ struct LoopVectorize : public FunctionPass { const BranchProbability ColdProb(1, 5); // 20% ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; - // If the target claims to have no vector registers don't attempt - // vectorization. - if (!TTI->getNumberOfRegisters(true)) + // Don't attempt if + // 1. the target claims to have no vector registers, and + // 2. 
interleaving won't help ILP. + // + // The second condition is necessary because, even if the target has no + // vector registers, loop vectorization may still enable scalar + // interleaving. + if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) return false; // Build up a worklist of inner-loops to vectorize. This is necessary as @@ -1633,18 +1648,17 @@ struct LoopVectorize : public FunctionPass { const LoopVectorizationCostModel::VectorizationFactor VF = CM.selectVectorizationFactor(OptForSize); - // Select the unroll factor. - const unsigned UF = - CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); + // Select the interleave count. + unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); - if (UF == 1) { + if (IC == 1) { emitOptimizationRemarkAnalysis( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), "not beneficial to vectorize and user disabled interleaving"); @@ -1654,17 +1668,14 @@ struct LoopVectorize : public FunctionPass { // Report the unrolling decision. emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("unrolled with interleaving factor " + - Twine(UF) + + Twine("interleaved by " + Twine(IC) + " (vectorization not beneficial)")); - // We decided not to vectorize, but we may want to unroll. - - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF); + InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); Unroller.vectorize(&LVL); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF); + InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); LB.vectorize(&LVL); ++LoopsVectorized; @@ -1675,10 +1686,10 @@ struct LoopVectorize : public FunctionPass { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + - ", unrolling interleave factor: " + Twine(UF) + ")"); + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("vectorized loop (vectorization width: ") + + Twine(VF.Width) + ", interleaved count: " + + Twine(IC) + ")"); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -1760,31 +1771,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, return Builder.CreateAdd(Val, Step, "induction"); } -/// \brief Find the operand of the GEP that should be checked for consecutive -/// stores. This ignores trailing indices that have no effect on the final -/// pointer. -static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { - const DataLayout &DL = Gep->getModule()->getDataLayout(); - unsigned LastOperand = Gep->getNumOperands() - 1; - unsigned GEPAllocSize = DL.getTypeAllocSize( - cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); - - // Walk backwards and try to peel off zeros. - while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { - // Find the type we're currently indexing into. 
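The selectInterleaveCount interface introduced earlier in this hunk honors a count specified by loop metadata before computing one. For orientation (an assumption about typical usage, not part of this patch), that metadata is what clang emits for its loop pragma:

// Forcing the interleave count from source; clang lowers the pragma to
// llvm.loop metadata, which selectInterleaveCount then returns as-is.
void scale(float *a, int n) {
#pragma clang loop interleave_count(2)
  for (int i = 0; i < n; ++i)
    a[i] *= 2.0f;
}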
- gep_type_iterator GEPTI = gep_type_begin(Gep); - std::advance(GEPTI, LastOperand - 1); - - // If it's a type with the same allocation size as the result of the GEP we - // can peel off the zero index. - if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize) - break; - --LastOperand; - } - - return LastOperand; -} - int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to structs. @@ -2503,9 +2489,9 @@ void InnerLoopVectorizer::createEmptyLoop() { */ BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); - assert(BypassBlock && "Invalid loop structure"); + assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops @@ -2545,44 +2531,35 @@ void InnerLoopVectorizer::createEmptyLoop() { // loop. Value *BackedgeCount = Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - BypassBlock->getTerminator()); + VectorPH->getTerminator()); if (BackedgeCount->getType()->isPointerTy()) BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, "backedge.ptrcnt.to.int", - BypassBlock->getTerminator()); + VectorPH->getTerminator()); Instruction *CheckBCOverflow = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", BypassBlock->getTerminator()); + "backedge.overflow", VectorPH->getTerminator()); // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. - Builder.SetInsertPoint(BypassBlock->getTerminator()); - Value *StartIdx = ExtendedIdx = OldInduction ? - Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), - IdxTy): - ConstantInt::get(IdxTy, 0); - - // We need an instruction to anchor the overflow check on. StartIdx needs to - // be defined before the overflow check branch. Because the scalar preheader - // is going to merge the start index and so the overflow branch block needs to - // contain a definition of the start index. - Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( - StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", - BypassBlock->getTerminator()); + Builder.SetInsertPoint(VectorPH->getTerminator()); + Value *StartIdx = ExtendedIdx = + OldInduction + ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), + IdxTy) + : ConstantInt::get(IdxTy, 0); // Count holds the overall loop count (N). Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - BypassBlock->getTerminator()); + VectorPH->getTerminator()); - LoopBypassBlocks.push_back(BypassBlock); + LoopBypassBlocks.push_back(VectorPH); // Split the single block loop into the two loop structure described above. 
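The backedge.overflow compare created above guards the trip-count computation: the trip count is the backedge-taken count plus one, which wraps to zero when the backedge-taken count is all-ones. A minimal stand-alone illustration (not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t BackedgeTakenCount = UINT32_MAX;    // loop body runs 2^32 times
  uint32_t TripCount = BackedgeTakenCount + 1; // unsigned wrap to 0
  assert(TripCount == 0); // the case backedge.overflow catches, sending
                          // control straight to the scalar loop
  return 0;
}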
- BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = @@ -2597,7 +2574,6 @@ void InnerLoopVectorizer::createEmptyLoop() { if (ParentLoop) { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(VectorPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); @@ -2615,9 +2591,20 @@ void InnerLoopVectorizer::createEmptyLoop() { // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + BasicBlock *NewVectorPH = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); + ReplaceInstWithInst( + VectorPH->getTerminator(), + BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); + VectorPH = NewVectorPH; + // This is the IR builder that we use to add all of the logic for bypassing // the new vector loop. - IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + IRBuilder<> BypassBuilder(VectorPH->getTerminator()); setDebugLocFromInst(BypassBuilder, getDebugLocFromInstOrOperands(OldInduction)); @@ -2646,24 +2633,14 @@ void InnerLoopVectorizer::createEmptyLoop() { // jump to the scalar loop. Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - - BasicBlock *LastBypassBlock = BypassBlock; - - // Generate code to check that the loops trip count that we computed by adding - // one to the backedge-taken count will not overflow. - { - auto PastOverflowCheck = - std::next(BasicBlock::iterator(OverflowCheckAnchor)); - BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); - LoopBypassBlocks.push_back(CheckBlock); - ReplaceInstWithInst( - LastBypassBlock->getTerminator(), - BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow)); - LastBypassBlock = CheckBlock; - } + NewVectorPH = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); + LoopBypassBlocks.push_back(VectorPH); + ReplaceInstWithInst(VectorPH->getTerminator(), + BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); + VectorPH = NewVectorPH; // Generate the code to check that the strides we assumed to be one are really // one. We want the new basic block to start at the first instruction in a @@ -2671,23 +2648,24 @@ void InnerLoopVectorizer::createEmptyLoop() { Instruction *StrideCheck; Instruction *FirstCheckInst; std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(LastBypassBlock->getTerminator()); + addStrideCheck(VectorPH->getTerminator()); if (StrideCheck) { AddedSafetyChecks = true; // Create a new block containing the stride check. 
- BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); + VectorPH->setName("vector.stridecheck"); + NewVectorPH = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); - LoopBypassBlocks.push_back(CheckBlock); + ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); + LoopBypassBlocks.push_back(VectorPH); // Replace the branch into the memory check block with a conditional branch // for the "few elements case". - ReplaceInstWithInst(LastBypassBlock->getTerminator(), - BranchInst::Create(MiddleBlock, CheckBlock, Cmp)); + ReplaceInstWithInst( + VectorPH->getTerminator(), + BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - Cmp = StrideCheck; - LastBypassBlock = CheckBlock; + VectorPH = NewVectorPH; } // Generate the code that checks in runtime if arrays overlap. We put the @@ -2695,28 +2673,26 @@ void InnerLoopVectorizer::createEmptyLoop() { // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator()); + Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator()); if (MemRuntimeCheck) { AddedSafetyChecks = true; // Create a new block containing the memory check. - BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck"); + VectorPH->setName("vector.memcheck"); + NewVectorPH = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); - LoopBypassBlocks.push_back(CheckBlock); + ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); + LoopBypassBlocks.push_back(VectorPH); // Replace the branch into the memory check block with a conditional branch // for the "few elements case". - ReplaceInstWithInst(LastBypassBlock->getTerminator(), - BranchInst::Create(MiddleBlock, CheckBlock, Cmp)); + ReplaceInstWithInst( + VectorPH->getTerminator(), + BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - Cmp = MemRuntimeCheck; - LastBypassBlock = CheckBlock; + VectorPH = NewVectorPH; } - ReplaceInstWithInst(LastBypassBlock->getTerminator(), - BranchInst::Create(MiddleBlock, VectorPH, Cmp)); - // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3831,7 +3807,7 @@ bool LoopVectorizationLegality::canVectorize() { } // We can only vectorize innermost loops. - if (!TheLoop->getSubLoopsVector().empty()) { + if (!TheLoop->empty()) { emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); return false; } @@ -3897,10 +3873,11 @@ bool LoopVectorizationLegality::canVectorize() { // Collect all of the variables that remain uniform after vectorization. collectLoopUniforms(); - DEBUG(dbgs() << "LV: We can vectorize this loop" << - (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : - "") - <<"!\n"); + DEBUG(dbgs() << "LV: We can vectorize this loop" + << (LAI->getRuntimePointerChecking()->Need + ? " (with a runtime bound check)" + : "") + << "!\n"); // Analyze interleaved memory accesses. if (EnableInterleavedMemAccesses) @@ -4130,118 +4107,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return true; } -///\brief Remove GEPs whose indices but the last one are loop invariant and -/// return the induction operand of the gep pointer. 
-static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); - if (!GEP) - return Ptr; - - unsigned InductionOperand = getGEPInductionOperand(GEP); - - // Check that all of the gep indices are uniform except for our induction - // operand. - for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) - if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) - return Ptr; - return GEP->getOperand(InductionOperand); -} - -///\brief Look for a cast use of the passed value. -static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { - Value *UniqueCast = nullptr; - for (User *U : Ptr->users()) { - CastInst *CI = dyn_cast<CastInst>(U); - if (CI && CI->getType() == Ty) { - if (!UniqueCast) - UniqueCast = CI; - else - return nullptr; - } - } - return UniqueCast; -} - -///\brief Get the stride of a pointer access in a loop. -/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a -/// pointer to the Value, or null otherwise. -static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { - const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); - if (!PtrTy || PtrTy->isAggregateType()) - return nullptr; - - // Try to remove a gep instruction to make the pointer (actually index at this - // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the - // pointer, otherwise, we are analyzing the index. - Value *OrigPtr = Ptr; - - // The size of the pointer access. - int64_t PtrAccessSize = 1; - - Ptr = stripGetElementPtr(Ptr, SE, Lp); - const SCEV *V = SE->getSCEV(Ptr); - - if (Ptr != OrigPtr) - // Strip off casts. - while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) - V = C->getOperand(); - - const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); - if (!S) - return nullptr; - - V = S->getStepRecurrence(*SE); - if (!V) - return nullptr; - - // Strip off the size of access multiplication if we are still analyzing the - // pointer. - if (OrigPtr == Ptr) { - const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout(); - DL.getTypeAllocSize(PtrTy->getElementType()); - if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { - if (M->getOperand(0)->getSCEVType() != scConstant) - return nullptr; - - const APInt &APStepVal = - cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return nullptr; - - int64_t StepVal = APStepVal.getSExtValue(); - if (PtrAccessSize != StepVal) - return nullptr; - V = M->getOperand(1); - } - } - - // Strip off casts. - Type *StripedOffRecurrenceCast = nullptr; - if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) { - StripedOffRecurrenceCast = C->getType(); - V = C->getOperand(); - } - - // Look for the loop invariant symbolic value. - const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V); - if (!U) - return nullptr; - - Value *Stride = U->getValue(); - if (!Lp->isLoopInvariant(Stride)) - return nullptr; - - // If we have stripped off the recurrence cast we have to make sure that we - // return the value that is used in this loop so that we can replace it later. 
- if (StripedOffRecurrenceCast) - Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); - - return Stride; -} - void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) @@ -4585,7 +4450,7 @@ LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; - if (OptForSize && Legal->getRuntimePointerCheck()->Need) { + if (OptForSize && Legal->getRuntimePointerChecking()->Need) { emitAnalysis(VectorizationReport() << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " @@ -4745,41 +4610,40 @@ unsigned LoopVectorizationCostModel::getWidestType() { return MaxWidth; } -unsigned -LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, - unsigned VF, - unsigned LoopCost) { +unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, + unsigned VF, + unsigned LoopCost) { - // -- The unroll heuristics -- - // We unroll the loop in order to expose ILP and reduce the loop overhead. + // -- The interleave heuristics -- + // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict // at this level. For example, frontend pressure (on decode or fetch) due to // code size, or the number and capabilities of the execution ports. // - // We use the following heuristics to select the unroll factor: - // 1. If the code has reductions, then we unroll in order to break the cross + // We use the following heuristics to select the interleave count: + // 1. If the code has reductions, then we interleave to break the cross // iteration dependency. - // 2. If the loop is really small, then we unroll in order to reduce the loop + // 2. If the loop is really small, then we interleave to reduce the loop // overhead. - // 3. We don't unroll if we think that we will spill registers to memory due - // to the increased register pressure. + // 3. We don't interleave if we think that we will spill registers to memory + // due to the increased register pressure. // Use the user preference, unless 'auto' is selected. int UserUF = Hints->getInterleave(); if (UserUF != 0) return UserUF; - // When we optimize for size, we don't unroll. + // When we optimize for size, we don't interleave. if (OptForSize) return 1; - // We used the distance for the unroll factor. + // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not unroll loops with a relatively small trip count. + // Do not interleave loops with a relatively small trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountUnrollThreshold) + if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); @@ -4800,32 +4664,32 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); R.NumInstructions = std::max(R.NumInstructions, 1U); - // We calculate the unroll factor using the following formula. + // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available - // registers. These registers are used by all of the unrolled instances. + // registers. 
These registers are used by all of the interleaved instances. // Next, divide the remaining registers by the number of registers that is // required by the loop, in order to estimate how many parallel instances // fit without causing spills. All of this is rounded down if necessary to be - // a power of two. We want power of two unroll factors to simplify any + // a power of two. We want power of two interleave counts to simplify any // addressing operations or alignment considerations. - unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / + unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers); - // Don't count the induction variable as unrolled. + // Don't count the induction variable as interleaved. if (EnableIndVarRegisterHeur) - UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / + IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / std::max(1U, (R.MaxLocalUsers - 1))); - // Clamp the unroll factor ranges to reasonable factors. - unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF); + // Clamp the interleave ranges to reasonable counts. + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); - // Check if the user has overridden the unroll max. + // Check if the user has overridden the max. if (VF == 1) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) - MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; + MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) - MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; + MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } // If we did not calculate the cost for VF (because the user selected the VF) @@ -4833,72 +4697,74 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (LoopCost == 0) LoopCost = expectedCost(VF); - // Clamp the calculated UF to be between the 1 and the max unroll factor + // Clamp the calculated IC to be between 1 and the max interleave count // that the target allows. - if (UF > MaxInterleaveSize) - UF = MaxInterleaveSize; - else if (UF < 1) - UF = 1; + if (IC > MaxInterleaveCount) + IC = MaxInterleaveCount; + else if (IC < 1) + IC = 1; - // Unroll if we vectorized this loop and there is a reduction that could - // benefit from unrolling. + // Interleave if we vectorized this loop and there is a reduction that could + // benefit from interleaving. if (VF > 1 && Legal->getReductionVars()->size()) { - DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); - return UF; + DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); + return IC; } // Note that if we've already vectorized the loop we will have done the - // runtime check and so unrolling won't require further checks. - bool UnrollingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerCheck()->Need); + // runtime check and so interleaving won't require further checks. + bool InterleavingRequiresRuntimePointerCheck = + (VF == 1 && Legal->getRuntimePointerChecking()->Need); - // We want to unroll small loops in order to reduce the loop overhead and + // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. 
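To make the small-loop clamp below concrete, here is a worked example with assumed numbers (PowerOf2Floor is reimplemented so the snippet stands alone; the real one lives in llvm/Support/MathExtras.h):

#include <algorithm>
#include <cstdio>

// Stand-in for llvm::PowerOf2Floor so the example compiles by itself.
static unsigned powerOf2Floor(unsigned X) {
  if (X == 0)
    return 0;
  unsigned P = 1;
  while (P * 2 != 0 && P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  unsigned IC = 4;             // from the register-pressure formula above
  unsigned SmallLoopCost = 20; // the flag's default
  unsigned LoopCost = 6;       // assumed cost-model estimate
  // 20 / 6 = 3, rounded down to a power of two gives 2.
  unsigned SmallIC = std::min(IC, powerOf2Floor(SmallLoopCost / LoopCost));
  std::printf("SmallIC = %u\n", SmallIC); // prints 2
  return 0;
}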
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); - if (!UnrollingRequiresRuntimePointerCheck && - LoopCost < SmallLoopCost) { + if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model - // to estimate the cost of the loop and unroll until the cost of the + // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. - unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); + unsigned SmallIC = + std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); - // Unroll until store/load ports (estimated by max unroll factor) are + // Interleave until store/load ports (estimated by max interleave count) are // saturated. unsigned NumStores = Legal->getNumStores(); unsigned NumLoads = Legal->getNumLoads(); - unsigned StoresUF = UF / (NumStores ? NumStores : 1); - unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1); + unsigned StoresIC = IC / (NumStores ? NumStores : 1); + unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop - // we're unrolling is inside another loop. Limit, by default to 2, so the + // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) { - unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF); - SmallUF = std::min(SmallUF, F); - StoresUF = std::min(StoresUF, F); - LoadsUF = std::min(LoadsUF, F); + unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); + SmallIC = std::min(SmallIC, F); + StoresIC = std::min(StoresIC, F); + LoadsIC = std::min(LoadsIC, F); } - if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { - DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n"); - return std::max(StoresUF, LoadsUF); + if (EnableLoadStoreRuntimeInterleave && + std::max(StoresIC, LoadsIC) > SmallIC) { + DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n"); + return std::max(StoresIC, LoadsIC); } - DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); - return SmallUF; + DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; } - // Unroll if this is a large loop (small loops are already dealt with by this - // point) that could benefit from interleaved unrolling. + // Interleave if this is a large loop (small loops are already dealt with by + // this + // point) that could benefit from interleaving. 
bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { - DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n"); - return UF; + DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + return IC; } - DEBUG(dbgs() << "LV: Not Unrolling.\n"); + DEBUG(dbgs() << "LV: Not Interleaving.\n"); return 1; } diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7c4c279..7bac407 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore( cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt<int> +MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, + cl::desc("Attempt to vectorize for this register size in bits")); + namespace { +// FIXME: Set this via cl::opt to allow overriding. static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; @@ -2136,9 +2141,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } // Prepare the operand vector. - for (unsigned j = 0; j < E->Scalars.size(); ++j) - Operands.push_back(cast<PHINode>(E->Scalars[j])-> - getIncomingValueForBlock(IBB)); + for (Value *V : E->Scalars) + Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); @@ -2172,8 +2176,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + for (Value *V : E->Scalars) + INVL.push_back(cast<Instruction>(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars); @@ -2191,9 +2195,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + for (Value *V : E->Scalars) { + LHSV.push_back(cast<Instruction>(V)->getOperand(0)); + RHSV.push_back(cast<Instruction>(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars); @@ -2217,10 +2221,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); - FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); + for (Value *V : E->Scalars) { + CondVec.push_back(cast<Instruction>(V)->getOperand(0)); + TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); + FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); } setInsertPointAfterBundle(E->Scalars); @@ -2259,9 +2263,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); else - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + for (Value *V : E->Scalars) { + LHSVL.push_back(cast<Instruction>(V)->getOperand(0)); + 
RHSVL.push_back(cast<Instruction>(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars); @@ -2322,8 +2326,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned AS = SI->getPointerAddressSpace(); ValueList ValueOp; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); + for (Value *V : E->Scalars) + ValueOp.push_back(cast<StoreInst>(V)->getValueOperand()); setInsertPointAfterBundle(E->Scalars); @@ -2351,8 +2355,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); + for (Value *V : E->Scalars) + Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); Value *Op0 = vectorizeTree(Op0VL); @@ -2360,8 +2364,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); + for (Value *V : E->Scalars) + OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); Value *OpVec = vectorizeTree(OpVL); OpVecs.push_back(OpVec); @@ -2397,8 +2401,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - CallInst *CEI = cast<CallInst>(E->Scalars[i]); + for (Value *V : E->Scalars) { + CallInst *CEI = cast<CallInst>(V); OpVL.push_back(CEI->getArgOperand(j)); } @@ -3089,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; + // Use the vector register size specified by the target unless overridden + // by a command-line option. + // TODO: It would be better to limit the vectorization factor based on + // data type rather than just register size. For example, x86 AVX has + // 256-bit registers, but it does not support integer operations + // at that width (that requires AVX2). + if (MaxVectorRegSizeOption.getNumOccurrences()) + MaxVecRegSize = MaxVectorRegSizeOption; + else + MaxVecRegSize = TTI->getRegisterBitWidth(true); + // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; @@ -3166,12 +3181,13 @@ private: bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, - BoUpSLP &R); + BoUpSLP &R, unsigned VecRegSize); bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, BoUpSLP &R); private: StoreListMap StoreRefs; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. 
}; /// \brief Check that the Values in the slice in VL array are still existent in @@ -3186,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH, } bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, - int CostThreshold, BoUpSLP &R) { + int CostThreshold, BoUpSLP &R, + unsigned VecRegSize) { unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout(); unsigned Sz = DL.getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; + unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; @@ -3277,12 +3294,16 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, I = ConsecutiveChain[I]; } - bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); - - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) { + if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed = true; + break; + } + } } return Changed; @@ -3293,8 +3314,8 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); const DataLayout &DL = BB->getModule()->getDataLayout(); - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - StoreInst *SI = dyn_cast<StoreInst>(it); + for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast<StoreInst>(&I); if (!SI) continue; @@ -3342,13 +3363,15 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, Type *Ty0 = I0->getType(); unsigned Sz = DL.getTypeSizeInBits(Ty0); + // FIXME: Register size should be a parameter to this function, so we can + // try different vectorization factors. unsigned VF = MinVecRegSize / Sz; - for (int i = 0, e = VL.size(); i < e; ++i) { - Type *Ty = VL[i]->getType(); + for (Value *V : VL) { + Type *Ty = V->getType(); if (!isValidElementType(Ty)) return false; - Instruction *Inst = dyn_cast<Instruction>(VL[i]); + Instruction *Inst = dyn_cast<Instruction>(V); if (!Inst || Inst->getOpcode() != Opcode0) return false; } @@ -3571,6 +3594,8 @@ public: const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; + // FIXME: Register size should be a parameter to this function, so we can + // try different vectorization factors. ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -3997,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { << it->second.size() << ".\n"); // Process the stores in chunks of 16. + // TODO: The limit of 16 inhibits greater vectorization factors. + // For example, AVX2 supports v32i8. Increasing this limit, however, + // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min<unsigned>(CE - CI, 16); Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), |
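Tying the SLP changes above together: vectorizeStoreChain derives its factor as VF = VecRegSize / element-size, and the new halving loop retries a store chain at narrower widths until one succeeds. A stand-alone sketch with assumed numbers:

#include <cstdio>

int main() {
  const unsigned MinVecRegSize = 128; // the file's floor
  unsigned MaxVecRegSize = 256;       // e.g. TTI->getRegisterBitWidth(true)
                                      // on an AVX target, or -slp-max-reg-size
  unsigned Sz = 32;                   // bits per element for an i32 store

  // Mirror of the new search: widest register size first, then halve.
  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2)
    std::printf("try VF = %u (reg size %u)\n", Size / Sz, Size);
  // Prints VF = 8 at 256 bits, then VF = 4 at 128 bits.
  return 0;
}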