Diffstat (limited to 'contrib/llvm/lib/Transforms')
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 26
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp | 84
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPO.cpp | 1
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 8
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 17
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 67
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 29
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp | 13
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 56
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp | 13
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 193
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 8
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp | 3
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp | 21
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp | 66
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Local.cpp | 11
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp | 1
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp | 106
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 474
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 102
29 files changed, 686 insertions(+), 671 deletions(-)
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index f754363..4762011 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -825,7 +825,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
V = GetElementPtrInst::Create(SI->first, V, Ops,
V->getName() + ".idx", Call);
Ops.clear();
- AA.copyValue(OrigLoad->getOperand(0), V);
}
// Since we're replacing a load make sure we take the alignment
// of the previous load.
@@ -837,7 +836,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
newLoad->setAAMetadata(AAInfo);
Args.push_back(newLoad);
- AA.copyValue(OrigLoad, Args.back());
}
}
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 76898f2..d044764 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -326,7 +326,18 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
/// instead.
bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
{
- if (Fn.isDeclaration() || Fn.mayBeOverridden())
+ // We cannot change the arguments if this TU does not define the function or
+ // if the linker may choose a function body from another TU, even if the
+ // nominal linkage indicates that other copies of the function have the same
+ // semantics. In the below example, the dead load from %p may not have been
+ // eliminated from the linker-chosen copy of f, so replacing %p with undef
+ // in callers may introduce undefined behavior.
+ //
+ // define linkonce_odr void @f(i32* %p) {
+ // %v = load i32 %p
+ // ret void
+ // }
+ if (!Fn.isStrongDefinitionForLinker())
return false;
// Functions with local linkage should already have been handled, except the
@@ -334,19 +345,6 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
return false;
- // If a function seen at compile time is not necessarily the one linked to
- // the binary being built, it is illegal to change the actual arguments
- // passed to it. These functions can be captured by isWeakForLinker().
- // *NOTE* that mayBeOverridden() is insufficient for this purpose as it
- // doesn't include linkage types like AvailableExternallyLinkage and
- // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of
- // *EQUIVALENT* globals that can be merged at link-time. However, the
- // semantic of *EQUIVALENT*-functions includes parameters. Changing
- // parameters breaks this assumption.
- //
- if (Fn.isWeakForLinker())
- return false;
-
if (Fn.use_empty())
return false;
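
The caller-side hazard is worth spelling out. A minimal IR sketch (hypothetical caller; only @f comes from the comment above):

    define void @caller(i32* %q) {
      ; Safe only if the linker is guaranteed to pick this TU's body of @f.
      ; Another TU's copy of @f may still load through %p, so rewriting the
      ; call to pass undef could introduce undefined behavior.
      call void @f(i32* %q)
      ret void
    }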
diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
new file mode 100644
index 0000000..67ba72d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -0,0 +1,84 @@
+//===-- ElimAvailExtern.cpp - Eliminate available_externally globals -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate available external global
+// definitions from the program, turning them into declarations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "elim-avail-extern"
+
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+ struct EliminateAvailableExternally : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ EliminateAvailableExternally() : ModulePass(ID) {
+ initializeEliminateAvailableExternallyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the EliminateAvailableExternally pass on the specified module.
+ //
+ bool runOnModule(Module &M) override;
+ };
+}
+
+char EliminateAvailableExternally::ID = 0;
+INITIALIZE_PASS(EliminateAvailableExternally, "elim-avail-extern",
+ "Eliminate Available Externally Globals", false, false)
+
+ModulePass *llvm::createEliminateAvailableExternallyPass() {
+ return new EliminateAvailableExternally();
+}
+
+bool EliminateAvailableExternally::runOnModule(Module &M) {
+ bool Changed = false;
+
+ // Drop initializers of available externally global variables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (!I->hasAvailableExternallyLinkage())
+ continue;
+ if (I->hasInitializer()) {
+ Constant *Init = I->getInitializer();
+ I->setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ I->removeDeadConstantUsers();
+ I->setLinkage(GlobalValue::ExternalLinkage);
+ NumVariables++;
+ Changed = true;
+ }
+
+ // Drop the bodies of available externally functions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (!I->hasAvailableExternallyLinkage())
+ continue;
+ if (!I->isDeclaration())
+ // This will set the linkage to external
+ I->deleteBody();
+ I->removeDeadConstantUsers();
+ NumFunctions++;
+ Changed = true;
+ }
+
+ return Changed;
+}
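
For illustration, a minimal before/after sketch of the pass's effect on IR (hypothetical globals, not from the patch):

    ; Before elim-avail-extern:
    @g = available_externally global i32 42
    define available_externally i32 @h() {
      ret i32 7
    }

    ; After: the initializer and body are dropped and the definitions
    ; become plain declarations.
    @g = external global i32
    declare i32 @h()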
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
index 2f8c7d9..b9462f2 100644
--- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -93,8 +93,11 @@ namespace {
makeVisible(*I, Delete);
- if (Delete)
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
I->setInitializer(nullptr);
+ I->setComdat(nullptr);
+ }
}
// Visit the Functions.
@@ -108,8 +111,11 @@ namespace {
makeVisible(*I, Delete);
- if (Delete)
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
I->deleteBody();
+ I->setComdat(nullptr);
+ }
}
// Visit the Aliases.
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
index fcacec328..50f56b0 100644
--- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
@@ -46,6 +46,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeStripDeadDebugInfoPass(Registry);
initializeStripNonDebugSymbolsPass(Registry);
initializeBarrierNoopPass(Registry);
+ initializeEliminateAvailableExternallyPass(Registry);
}
void LLVMInitializeIPO(LLVMPassRegistryRef R) {
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 963f1bb..88e5e47 100644
--- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -105,6 +105,7 @@ PassManagerBuilder::PassManagerBuilder() {
VerifyInput = false;
VerifyOutput = false;
MergeFunctions = false;
+ PrepareForLTO = false;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -319,8 +320,8 @@ void PassManagerBuilder::populateModulePassManager(
// Re-rotate loops in all our loop nests. These may have fallen out of
// rotated form due to GVN or other transformations, and the vectorizer relies
- // on the rotated form.
- MPM.add(createLoopRotatePass());
+ // on the rotated form. Disable header duplication at -Oz.
+ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
// Distribute loops to allow partial vectorization. I.e. isolate dependences
// into separate loop that would otherwise inhibit vectorization.
@@ -401,6 +402,17 @@ void PassManagerBuilder::populateModulePassManager(
// GlobalOpt already deletes dead functions and globals, at -O2 try a
// late pass of GlobalDCE. It is capable of deleting dead cycles.
if (OptLevel > 1) {
+ if (!PrepareForLTO) {
+ // Remove avail extern fns and globals definitions if we aren't
+ // compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link-time. Note if they
+ // are unreferenced they will be removed by GlobalDCE below, so
+ // this only impacts referenced available externally globals.
+ // Eventually they will be suppressed during codegen, but eliminating
+ // here enables more opportunity for GlobalDCE as it may make
+ // globals referenced by available external functions dead.
+ MPM.add(createEliminateAvailableExternallyPass());
+ }
MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
MPM.add(createConstantMergePass()); // Merge dup global constants
}
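
A minimal sketch of how a frontend would drive the new flag (standard PassManagerBuilder usage; only PrepareForLTO is added by this patch):

    llvm::PassManagerBuilder PMB;
    PMB.OptLevel = 2;
    // Keep available_externally definitions in the IR so they remain
    // eligible for inlining at link time; elim-avail-extern is skipped.
    PMB.PrepareForLTO = true;
    llvm::legacy::PassManager MPM;
    PMB.populateModulePassManager(MPM);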
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 010b7b5..0bd6fd2 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -3928,8 +3928,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V =
- SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AC, &I))
+ if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1,
+ I.getFastMathFlags(), DL, TLI, DT, AC, &I))
return ReplaceInstUsesWith(I, V);
// Simplify 'fcmp pred X, X'
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 97ea8df..ac934f1 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetFolder.h"
@@ -177,6 +178,8 @@ private:
// Mode in which we are running the combiner.
const bool MinimizeSize;
+ AliasAnalysis *AA;
+
// Required analyses.
// FIXME: These can never be null and should be references.
AssumptionCache *AC;
@@ -192,10 +195,11 @@ private:
public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
- bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI,
+ bool MinimizeSize, AliasAnalysis *AA,
+ AssumptionCache *AC, TargetLibraryInfo *TLI,
DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
- AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
+ AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
/// \brief Run the combiner over the entire worklist until it is empty.
///
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e7a4533..e3179db 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -749,10 +749,25 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// where there are several consecutive memory accesses to the same location,
// separated by a few arithmetic operations.
BasicBlock::iterator BBI = &LI;
- if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+ AAMDNodes AATags;
+ if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,
+ 6, AA, &AATags)) {
+ if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) {
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ };
+ combineMetadata(NLI, &LI, KnownIDs);
+ }
+
return ReplaceInstUsesWith(
LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
LI.getName() + ".cast"));
+ }
// load(gep null, ...) -> unreachable
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
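
Intersecting the metadata matters because the earlier load now stands in for the later one. A rough IR sketch (hypothetical metadata nodes):

    %a = load i32, i32* %p, !tbaa !0
    %b = load i32, i32* %p, !tbaa !1    ; forwarded from %a
    ; After %b is replaced by %a, combineMetadata keeps on %a only what
    ; holds for both loads, e.g. the merged (most generic) TBAA tag.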
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 24446c8..2730472 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -14,6 +14,8 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
@@ -60,56 +62,6 @@ static bool CheapToScalarize(Value *V, bool isConstant) {
return false;
}
-/// FindScalarElement - Given a vector and an element number, see if the scalar
-/// value is already around as a register, for example if it were inserted then
-/// extracted from the vector.
-static Value *FindScalarElement(Value *V, unsigned EltNo) {
- assert(V->getType()->isVectorTy() && "Not looking at a vector?");
- VectorType *VTy = cast<VectorType>(V->getType());
- unsigned Width = VTy->getNumElements();
- if (EltNo >= Width) // Out of range access.
- return UndefValue::get(VTy->getElementType());
-
- if (Constant *C = dyn_cast<Constant>(V))
- return C->getAggregateElement(EltNo);
-
- if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
- // If this is an insert to a variable element, we don't know what it is.
- if (!isa<ConstantInt>(III->getOperand(2)))
- return nullptr;
- unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
-
- // If this is an insert to the element we are looking for, return the
- // inserted value.
- if (EltNo == IIElt)
- return III->getOperand(1);
-
- // Otherwise, the insertelement doesn't modify the value, recurse on its
- // vector input.
- return FindScalarElement(III->getOperand(0), EltNo);
- }
-
- if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
- unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
- int InEl = SVI->getMaskValue(EltNo);
- if (InEl < 0)
- return UndefValue::get(VTy->getElementType());
- if (InEl < (int)LHSWidth)
- return FindScalarElement(SVI->getOperand(0), InEl);
- return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
- }
-
- // Extract a value from a vector add operation with a constant zero.
- Value *Val = nullptr; Constant *Con = nullptr;
- if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) {
- if (Con->getAggregateElement(EltNo)->isNullValue())
- return FindScalarElement(Val, EltNo);
- }
-
- // Otherwise, we don't know.
- return nullptr;
-}
-
// If we have a PHI node with a vector type that has only 2 uses: feed
// itself and be an operand of extractelement at a constant location,
// try to replace the PHI of the vector type with a PHI of a scalar type.
@@ -178,6 +130,10 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
}
Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+ if (Value *V = SimplifyExtractElementInst(
+ EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC))
+ return ReplaceInstUsesWith(EI, V);
+
// If vector val is constant with all elements the same, replace EI with
// that element. We handle a known element # below.
if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
@@ -190,10 +146,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
unsigned IndexVal = IdxC->getZExtValue();
unsigned VectorWidth = EI.getVectorOperandType()->getNumElements();
- // If this is extracting an invalid index, turn this into undef, to avoid
- // crashing the code below.
- if (IndexVal >= VectorWidth)
- return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ // InstSimplify handles cases where the index is invalid.
+ assert(IndexVal < VectorWidth);
// This instruction only demands the single element from the input vector.
// If the input vector has a single use, simplify it based on this use
@@ -209,16 +163,13 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
}
}
- if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
- return ReplaceInstUsesWith(EI, Elt);
-
// If the this extractelement is directly using a bitcast from a vector of
// the same number of elements, see if we can find the source element from
// it. In this case, we will end up needing to bitcast the scalars.
if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
if (VT->getNumElements() == VectorWidth)
- if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+ if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal))
return new BitCastInst(Elt, EI.getType());
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 2a81689..fd34a24 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2174,16 +2174,9 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
if (!EV.hasIndices())
return ReplaceInstUsesWith(EV, Agg);
- if (Constant *C = dyn_cast<Constant>(Agg)) {
- if (Constant *C2 = C->getAggregateElement(*EV.idx_begin())) {
- if (EV.getNumIndices() == 0)
- return ReplaceInstUsesWith(EV, C2);
- // Extract the remaining indices out of the constant indexed by the
- // first index
- return ExtractValueInst::Create(C2, EV.getIndices().slice(1));
- }
- return nullptr; // Can't handle other constants
- }
+ if (Value *V =
+ SimplifyExtractValueInst(Agg, EV.getIndices(), DL, TLI, DT, AC))
+ return ReplaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
// We're extracting from an insertvalue instruction, compare the indices
@@ -2972,8 +2965,9 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
static bool
combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
- AssumptionCache &AC, TargetLibraryInfo &TLI,
- DominatorTree &DT, LoopInfo *LI = nullptr) {
+ AliasAnalysis *AA, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, DominatorTree &DT,
+ LoopInfo *LI = nullptr) {
// Minimizing size?
bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);
auto &DL = F.getParent()->getDataLayout();
@@ -2998,7 +2992,8 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))
Changed = true;
- InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI);
+ InstCombiner IC(Worklist, &Builder, MinimizeSize,
+ AA, &AC, &TLI, &DT, DL, LI);
if (IC.run())
Changed = true;
@@ -3017,7 +3012,8 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto *LI = AM->getCachedResult<LoopAnalysis>(F);
- if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI))
+ // FIXME: The AliasAnalysis is not yet supported in the new pass manager
+ if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
@@ -3050,6 +3046,7 @@ public:
void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
+ AU.addRequired<AliasAnalysis>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
@@ -3061,6 +3058,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
return false;
// Required analyses.
+ auto AA = &getAnalysis<AliasAnalysis>();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -3069,7 +3067,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI);
+ return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, LI);
}
char InstructionCombiningPass::ID = 0;
@@ -3078,6 +3076,7 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index 60903c8..d1eba6e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -656,11 +656,14 @@ namespace {
LeaderTableEntry* Prev = nullptr;
LeaderTableEntry* Curr = &LeaderTable[N];
- while (Curr->Val != I || Curr->BB != BB) {
+ while (Curr && (Curr->Val != I || Curr->BB != BB)) {
Prev = Curr;
Curr = Curr->Next;
}
+ if (!Curr)
+ return;
+
if (Prev) {
Prev->Next = Curr->Next;
} else {
@@ -1304,11 +1307,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (V->getType()->getScalarType()->isPointerTy()) {
AliasAnalysis *AA = gvn.getAliasAnalysis();
- for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
- AA->copyValue(LI, NewPHIs[i]);
-
- // Now that we've copied information to the new PHIs, scan through
- // them again and inform alias analysis that we've added potentially
+ // Scan the new PHIs and inform alias analysis that we've added potentially
// escaping uses to any values that are operands to these PHIs.
for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) {
PHINode *P = NewPHIs[i];
@@ -1796,7 +1795,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// In general, GVN unifies expressions over different control-flow
// regions, and so we need a conservative combination of the noalias
// scopes.
- unsigned KnownIDs[] = {
+ static const unsigned KnownIDs[] = {
LLVMContext::MD_tbaa,
LLVMContext::MD_alias_scope,
LLVMContext::MD_noalias,
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 6f03754..2a954d9 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -135,6 +136,10 @@ namespace {
PHINode *IndVar, SCEVExpander &Rewriter);
void SinkUnusedInvariants(Loop *L);
+
+ Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
+ Instruction *InsertPt, Type *Ty,
+ bool &IsHighCostExpansion);
};
}
@@ -496,6 +501,52 @@ struct RewritePhi {
};
}
+Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
+ Loop *L, Instruction *InsertPt,
+ Type *ResultTy,
+ bool &IsHighCostExpansion) {
+ using namespace llvm::PatternMatch;
+
+ if (!Rewriter.isHighCostExpansion(S, L)) {
+ IsHighCostExpansion = false;
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+ }
+
+ // Before expanding S into an expensive LLVM expression, see if we can use an
+ // already existing value as the expansion for S. There is potential to make
+ // this significantly smarter, but this simple heuristic already gets some
+ // interesting cases.
+
+ SmallVector<BasicBlock *, 4> Latches;
+ L->getLoopLatches(Latches);
+
+ for (BasicBlock *BB : Latches) {
+ ICmpInst::Predicate Pred;
+ Instruction *LHS, *RHS;
+ BasicBlock *TrueBB, *FalseBB;
+
+ if (!match(BB->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
+ TrueBB, FalseBB)))
+ continue;
+
+ if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) {
+ IsHighCostExpansion = false;
+ return LHS;
+ }
+
+ if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) {
+ IsHighCostExpansion = false;
+ return RHS;
+ }
+ }
+
+ // We didn't find anything, fall back to using SCEVExpander.
+ assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!");
+ IsHighCostExpansion = true;
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+}
+
//===----------------------------------------------------------------------===//
// RewriteLoopExitValues - Optimize IV users outside the loop.
// As a side effect, reduces the amount of IV processing within the loop.
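
A sketch of the pattern the new heuristic recognizes (C++ for illustration, not from the patch):

    // The exit value of 'p' is 'a + n'. Instead of asking SCEVExpander to
    // re-materialize that expression after the loop, ExpandSCEVIfNeeded
    // notices that the latch compare already uses an instruction ('end')
    // whose SCEV equals the exit value, and reuses it.
    int sum(const int *a, int n) {
      int s = 0;
      const int *end = a + n;
      for (const int *p = a; p != end; ++p)   // latch: icmp ne p, end
        s += *p;
      return s;
    }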
@@ -628,7 +679,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
continue;
}
- Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
+ bool HighCost = false;
+ Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst,
+ PN->getType(), HighCost);
DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
<< " LoopVal = " << *Inst << "\n");
@@ -637,7 +690,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
DeadInsts.push_back(ExitVal);
continue;
}
- bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L);
// Collect all the candidate PHINodes to be rewritten.
RewritePhiSet.push_back(
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index f0e6d64..43fc50e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -602,7 +602,8 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
while (!I.use_empty()) {
- Instruction *User = I.user_back();
+ Value::user_iterator UI = I.user_begin();
+ auto *User = cast<Instruction>(*UI);
if (!DT->isReachableFromEntry(User->getParent())) {
User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
continue;
@@ -610,6 +611,16 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
// The user must be a PHI node.
PHINode *PN = cast<PHINode>(User);
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable.
+ Use &U = UI.getUse();
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ continue;
+ }
+
BasicBlock *ExitBlock = PN->getParent();
assert(ExitBlockSet.count(ExitBlock) &&
"The LCSSA PHI is not in an exit block!");
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0325d26..1b9859b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -34,6 +34,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>
#define LDIST_NAME "loop-distribute"
@@ -55,70 +56,6 @@ static cl::opt<bool> DistributeNonIfConvertible(
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
-/// \brief Remaps instructions in a loop including the preheader.
-static void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks,
- ValueToValueMapTy &VMap) {
- // Rewrite the code to refer to itself.
- for (auto *BB : Blocks)
- for (auto &Inst : *BB)
- RemapInstruction(&Inst, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
-}
-
-/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
-/// Blocks.
-///
-/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
-/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
-static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
- Loop *OrigLoop, ValueToValueMapTy &VMap,
- const Twine &NameSuffix, LoopInfo *LI,
- DominatorTree *DT,
- SmallVectorImpl<BasicBlock *> &Blocks) {
- Function *F = OrigLoop->getHeader()->getParent();
- Loop *ParentLoop = OrigLoop->getParentLoop();
-
- Loop *NewLoop = new Loop();
- if (ParentLoop)
- ParentLoop->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
-
- BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
- BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
- // To rename the loop PHIs.
- VMap[OrigPH] = NewPH;
- Blocks.push_back(NewPH);
-
- // Update LoopInfo.
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewPH, *LI);
-
- // Update DominatorTree.
- DT->addNewBlock(NewPH, LoopDomBB);
-
- for (BasicBlock *BB : OrigLoop->getBlocks()) {
- BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
- VMap[BB] = NewBB;
-
- // Update LoopInfo.
- NewLoop->addBasicBlockToLoop(NewBB, *LI);
-
- // Update DominatorTree.
- BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
- DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
-
- Blocks.push_back(NewBB);
- }
-
- // Move them physically from the end of the block list.
- F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
- F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
- NewLoop->getHeader(), F->end());
-
- return NewLoop;
-}
-
namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
@@ -204,7 +141,9 @@ public:
ValueToValueMapTy &getVMap() { return VMap; }
/// \brief Remaps the cloned instructions using VMap.
- void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); }
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
/// \brief Based on the set of instructions selected for this partition,
/// removes the unnecessary ones.
@@ -493,15 +432,14 @@ public:
/// partitions its entry is set to -1.
SmallVector<int, 8>
computePartitionSetForPointers(const LoopAccessInfo &LAI) {
- const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck =
- LAI.getRuntimePointerCheck();
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
unsigned N = RtPtrCheck->Pointers.size();
SmallVector<int, 8> PtrToPartitions(N);
for (unsigned I = 0; I < N; ++I) {
- Value *Ptr = RtPtrCheck->Pointers[I];
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
auto Instructions =
- LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]);
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
int &Partition = PtrToPartitions[I];
// First set it to uninitialized.
@@ -629,121 +567,6 @@ private:
AccessesType Accesses;
};
-/// \brief Handles the loop versioning based on memchecks.
-class LoopVersioning {
-public:
- LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
- DominatorTree *DT,
- const SmallVector<int, 8> *PtrToPartition = nullptr)
- : VersionedLoop(L), NonVersionedLoop(nullptr),
- PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) {}
-
- /// \brief Returns true if we need memchecks to disambiguate may-aliasing
- /// accesses.
- bool needsRuntimeChecks() const {
- return LAI.getRuntimePointerCheck()->needsAnyChecking(PtrToPartition);
- }
-
- /// \brief Performs the CFG manipulation part of versioning the loop including
- /// the DominatorTree and LoopInfo updates.
- void versionLoop(Pass *P) {
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- // Add the memcheck in the original preheader (this is empty initially).
- BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader();
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition);
- assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
-
- // Rename the block to make the IR more readable.
- MemCheckBB->setName(VersionedLoop->getHeader()->getName() +
- ".lver.memcheck");
-
- // Create empty preheader for the loop (and after cloning for the
- // non-versioned loop).
- BasicBlock *PH =
- SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI);
- PH->setName(VersionedLoop->getHeader()->getName() + ".ph");
-
- // Clone the loop including the preheader.
- //
- // FIXME: This does not currently preserve SimplifyLoop because the exit
- // block is a join between the two loops.
- SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
- NonVersionedLoop =
- cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap,
- ".lver.orig", LI, DT, NonVersionedLoopBlocks);
- remapInstructionsInLoop(NonVersionedLoopBlocks, VMap);
-
- // Insert the conditional branch based on the result of the memchecks.
- Instruction *OrigTerm = MemCheckBB->getTerminator();
- BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader(), MemRuntimeCheck,
- OrigTerm);
- OrigTerm->eraseFromParent();
-
- // The loops merge in the original exit block. This is now dominated by the
- // memchecking block.
- DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB);
- }
-
- /// \brief Adds the necessary PHI nodes for the versioned loops based on the
- /// loop-defined values used outside of the loop.
- void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
- BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
- assert(PHIBlock && "No single successor to loop exit block");
-
- for (auto *Inst : DefsUsedOutside) {
- auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]);
- PHINode *PN;
-
- // First see if we have a single-operand PHI with the value defined by the
- // original loop.
- for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- assert(PN->getNumOperands() == 1 &&
- "Exit block should only have on predecessor");
- if (PN->getIncomingValue(0) == Inst)
- break;
- }
- // If not create it.
- if (!PN) {
- PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
- PHIBlock->begin());
- for (auto *User : Inst->users())
- if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
- User->replaceUsesOfWith(Inst, PN);
- PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
- }
- // Add the new incoming value from the non-versioned loop.
- PN->addIncoming(NonVersionedLoopInst,
- NonVersionedLoop->getExitingBlock());
- }
- }
-
-private:
- /// \brief The original loop. This becomes the "versioned" one, i.e. control
- /// goes if the memchecks all pass.
- Loop *VersionedLoop;
- /// \brief The fall-back loop, i.e. if any of the memchecks fail.
- Loop *NonVersionedLoop;
-
- /// \brief For each memory pointer it contains the partitionId it is used in.
- /// If nullptr, no partitioning is used.
- ///
- /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck().
- /// If the pointer is used in multiple partitions the entry is set to -1.
- const SmallVector<int, 8> *PtrToPartition;
-
- /// \brief This maps the instructions from VersionedLoop to their counterpart
- /// in NonVersionedLoop.
- ValueToValueMapTy VMap;
-
- /// \brief Analyses used.
- const LoopAccessInfo &LAI;
- LoopInfo *LI;
- DominatorTree *DT;
-};
-
/// \brief Returns the instructions that use values defined in the loop.
static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) {
SmallVector<Instruction *, 8> UsedOutside;
@@ -929,7 +752,7 @@ private:
LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition);
if (LVer.needsRuntimeChecks()) {
DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition));
+ DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition));
LVer.versionLoop(this);
LVer.addPHINodes(DefsUsedOutside);
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 714ce91..a21ca24 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -508,7 +508,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
ICmpInst *NewPreCond =
cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
- PreCond->replaceAllUsesWith(NewPreCond);
+ PreCondBr->setCondition(NewPreCond);
RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 2554655..9d7e57f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -282,21 +282,21 @@ static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
DEBUG(dbgs() << "populateWorklist called\n");
LoopVector LoopList;
Loop *CurrentLoop = &L;
- std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector();
- while (vec.size() != 0) {
+ const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+ while (!Vec->empty()) {
// The current loop has multiple subloops in it hence it is not tightly
// nested.
// Discard all loops above it added into Worklist.
- if (vec.size() != 1) {
+ if (Vec->size() != 1) {
LoopList.clear();
return;
}
LoopList.push_back(CurrentLoop);
- CurrentLoop = *(vec.begin());
- vec = CurrentLoop->getSubLoopsVector();
+ CurrentLoop = Vec->front();
+ Vec = &CurrentLoop->getSubLoops();
}
LoopList.push_back(CurrentLoop);
- V.push_back(LoopList);
+ V.push_back(std::move(LoopList));
}
static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 9e7558d..d78db6c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -840,8 +840,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Reduce count based on the type of unrolling and the threshold values.
unsigned OriginalCount = Count;
- bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime;
- if (HasRuntimeUnrollDisablePragma(L)) {
+ bool AllowRuntime =
+ (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime);
+ // Don't unroll a runtime trip count loop with unroll full pragma.
+ if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
AllowRuntime = false;
}
if (Unrolling == Partial) {
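
For illustration, the pragma interaction this guards (hypothetical source; '#pragma clang loop' is the frontend spelling of the unroll metadata):

    void zero(int *a, int n) {
      // The trip count is only known at runtime, so unroll(full) cannot be
      // honored by full unrolling and must not fall back to runtime
      // unrolling either.
    #pragma clang loop unroll(full)
      for (int i = 0; i < n; ++i)
        a[i] = 0;
    }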
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 243db8d..643f374 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -301,10 +301,6 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
// Merged instruction
Instruction *HoistedInst = HoistCand->clone();
- // Notify AA of the new value.
- if (isa<LoadInst>(HoistCand))
- AA->copyValue(HoistCand, HoistedInst);
-
// Hoist instruction.
HoistedInst->insertBefore(HoistPt);
@@ -451,9 +447,6 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
if (NewPN->getType()->getScalarType()->isPointerTy()) {
- // Notify AA of the new value.
- AA->copyValue(Opd1, NewPN);
- AA->copyValue(Opd2, NewPN);
// AA needs to be informed when a PHI-use of the pointer value is added
for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) {
unsigned J = PHINode::getOperandNumForIncomingValue(I);
@@ -491,7 +484,6 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
// Create the new store to be inserted at the join point.
StoreInst *SNew = (StoreInst *)(S0->clone());
Instruction *ANew = A0->clone();
- AA->copyValue(S0, SNew);
SNew->insertBefore(InsertPt);
ANew->insertBefore(SNew);
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 9ecaf10..366301a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -399,8 +399,8 @@ static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
// at least if they do, are leaf functions that cause only finite stack
// growth. In particular, the optimizer likes to form things like memsets
// out of stores in the original IR. Another important example is
- // llvm.frameescape which must occur in the entry block. Inserting a
- // safepoint before it is not legal since it could push the frameescape
+ // llvm.localescape which must occur in the entry block. Inserting a
+ // safepoint before it is not legal since it could push the localescape
// out of the entry block.
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index 305175f..4d3a708 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -1799,11 +1799,10 @@ bool IPSCCP::runOnModule(Module &M) {
if (!TI->use_empty())
TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
TI->eraseFromParent();
+ new UnreachableInst(M.getContext(), BB);
if (&*BB != &F->front())
BlocksToErase.push_back(BB);
- else
- new UnreachableInst(M.getContext(), BB);
continue;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index 056dd11..d1a0a82 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2593,13 +2593,21 @@ private:
V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&
canConvertValue(DL, NewAllocaTy, LI.getType())) {
- V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
- LI.getName());
+ LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ V = NewLI;
} else {
Type *LTy = TargetTy->getPointerTo();
- V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
- getSliceAlign(TargetTy), LI.isVolatile(),
- LI.getName());
+ LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(TargetTy),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ V = NewLI;
IsPtrAdjusted = true;
}
V = convertValue(DL, IRB, V, TargetTy);
@@ -2722,7 +2730,8 @@ private:
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
- (void)NewSI;
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
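
The newly copied ordering matters for accesses that are both volatile and atomic. A minimal IR sketch (hypothetical values):

    ; Original access on the alloca slice:
    %v = load atomic volatile i32, i32* %slice seq_cst, align 4
    ; The rewritten load must keep the ordering rather than silently
    ; becoming a plain volatile load:
    %v.new = load atomic volatile i32, i32* %new.ptr seq_cst, align 4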
diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 53471de..ef7daca 100644
--- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -440,8 +440,6 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
// Create the new PHI node, insert it into NewBB at the end of the block
PHINode *NewPHI =
PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
- if (AA)
- AA->copyValue(PN, NewPHI);
// NOTE! This loop walks backwards for a reason! First off, this minimizes
// the cost of removal if we end up removing a large number of values, and
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 4f8d1df..cc4d6c6 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -720,3 +721,68 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
ModuleLevelChanges, Returns, NameSuffix, CodeInfo,
nullptr);
}
+
+/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
+void llvm::remapInstructionsInBlocks(
+ const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
+ // Rewrite the code to refer to itself.
+ for (auto *BB : Blocks)
+ for (auto &Inst : *BB)
+ RemapInstruction(&Inst, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+}
+
+/// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
+Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+ Loop *OrigLoop, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, LoopInfo *LI,
+ DominatorTree *DT,
+ SmallVectorImpl<BasicBlock *> &Blocks) {
+ Function *F = OrigLoop->getHeader()->getParent();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ Loop *NewLoop = new Loop();
+ if (ParentLoop)
+ ParentLoop->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
+ assert(OrigPH && "No preheader");
+ BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
+ // To rename the loop PHIs.
+ VMap[OrigPH] = NewPH;
+ Blocks.push_back(NewPH);
+
+ // Update LoopInfo.
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewPH, *LI);
+
+ // Update DominatorTree.
+ DT->addNewBlock(NewPH, LoopDomBB);
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+ VMap[BB] = NewBB;
+
+ // Update LoopInfo.
+ NewLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ // Update DominatorTree.
+ BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+
+ Blocks.push_back(NewBB);
+ }
+
+ // Move them physically from the end of the block list.
+ F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
+ F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
+ NewLoop->getHeader(), F->end());
+
+ return NewLoop;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
index 5608557..50ca623 100644
--- a/contrib/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -900,13 +900,10 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
if (auto *GO = dyn_cast<GlobalObject>(V)) {
// If there is a large requested alignment and we can, bump up the alignment
- // of the global.
- if (GO->isDeclaration())
- return Align;
- // If the memory we set aside for the global may not be the memory used by
- // the final program then it is impossible for us to reliably enforce the
- // preferred alignment.
- if (GO->isWeakForLinker())
+ // of the global. If the memory we set aside for the global may not be the
+ // memory used by the final program then it is impossible for us to reliably
+ // enforce the preferred alignment.
+ if (!GO->isStrongDefinitionForLinker())
return Align;
if (GO->getAlignment() >= PrefAlign)
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 2e7d21c..5c98043 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -403,7 +403,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
PHINode *PN = cast<PHINode>(I);
PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
PN->getName()+".be", BETerminator);
- if (AA) AA->copyValue(PN, NewPN);
// Loop over the PHI node, moving all entries except the one for the
// preheader over to the new PHI node.
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
new file mode 100644
index 0000000..832079d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -0,0 +1,106 @@
+//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility class to perform loop versioning. The versioned
+// loop speculates that otherwise may-aliasing memory accesses don't overlap and
+// emits checks to prove this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+
+using namespace llvm;
+
+LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
+ DominatorTree *DT,
+ const SmallVector<int, 8> *PtrToPartition)
+ : VersionedLoop(L), NonVersionedLoop(nullptr),
+ PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) {
+ assert(L->getExitBlock() && "No single exit block");
+ assert(L->getLoopPreheader() && "No preheader");
+}
+
+bool LoopVersioning::needsRuntimeChecks() const {
+ return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition);
+}
+
+void LoopVersioning::versionLoop(Pass *P) {
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ // Add the memcheck in the original preheader (this is empty initially).
+ BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader();
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition);
+ assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
+
+ // Rename the block to make the IR more readable.
+ MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck");
+
+ // Create empty preheader for the loop (and after cloning for the
+ // non-versioned loop).
+ BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI);
+ PH->setName(VersionedLoop->getHeader()->getName() + ".ph");
+
+ // Clone the loop including the preheader.
+ //
+ // FIXME: This does not currently preserve SimplifyLoop because the exit
+ // block is a join between the two loops.
+ SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
+ NonVersionedLoop =
+ cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig",
+ LI, DT, NonVersionedLoopBlocks);
+ remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
+
+ // Insert the conditional branch based on the result of the memchecks.
+ Instruction *OrigTerm = MemCheckBB->getTerminator();
+ BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader(), MemRuntimeCheck,
+ OrigTerm);
+ OrigTerm->eraseFromParent();
+
+ // The loops merge in the original exit block. This is now dominated by the
+ // memchecking block.
+ DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB);
+}
+
+void LoopVersioning::addPHINodes(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+ BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
+ assert(PHIBlock && "No single successor to loop exit block");
+
+ for (auto *Inst : DefsUsedOutside) {
+ auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]);
+ PHINode *PN;
+
+ // First see if we have a single-operand PHI with the value defined by the
+ // original loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+ "Exit block should only have on predecessor");
+ if (PN->getIncomingValue(0) == Inst)
+ break;
+ }
+ // If not create it.
+ if (!PN) {
+ PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
+ PHIBlock->begin());
+ for (auto *User : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))
+ User->replaceUsesOfWith(Inst, PN);
+ PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
+ }
+ // Add the new incoming value from the non-versioned loop.
+ PN->addIncoming(NonVersionedLoopInst, NonVersionedLoop->getExitingBlock());
+ }
+}
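
Typical client code, mirroring how LoopDistribute uses the class earlier in this diff (a sketch; PtrToPartition is optional):

    LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition);
    if (LVer.needsRuntimeChecks()) {
      // Emit the memchecks, clone the loop, insert the conditional
      // branch, and add the exit-block PHIs.
      LVer.versionLoop(this);
      LVer.addPHINodes(DefsUsedOutside);
    }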
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ba1417..69ca268 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -148,8 +148,9 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
cl::desc("Maximum factor for an interleaved access group (default = 8)"),
cl::init(8));
-/// We don't unroll loops with a known constant trip count below this number.
-static const unsigned TinyTripCountUnrollThreshold = 128;
+/// We don't interleave loops with a known constant trip count below this
+/// number.
+static const unsigned TinyTripCountInterleaveThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
@@ -180,7 +181,8 @@ static cl::opt<unsigned> ForceTargetInstructionCost(
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
- cl::desc("The cost of a loop that is considered 'small' by the unroller."));
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
@@ -188,10 +190,11 @@ static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
-// Runtime unroll loops for load/store throughput.
-static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
- "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
- cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
@@ -200,15 +203,15 @@ static cl::opt<unsigned> NumberOfStoresToPredicate(
static cl::opt<bool> EnableIndVarRegisterHeur(
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
- cl::desc("Count the induction variable only once when unrolling"));
+ cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
-static cl::opt<unsigned> MaxNestedScalarReductionUF(
- "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
- cl::desc("The maximum unroll factor to use when unrolling a scalar "
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
namespace {
@@ -921,8 +924,8 @@ public:
bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
/// Returns the information that we collected about runtime memory check.
- const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
- return LAI->getRuntimePointerCheck();
+ const RuntimePointerChecking *getRuntimePointerChecking() const {
+ return LAI->getRuntimePointerChecking();
}
const LoopAccessInfo *getLAI() const {
@@ -1105,12 +1108,19 @@ public:
/// 64 bit loop indices.
unsigned getWidestType();
+ /// \return The desired interleave count.
+  /// If an interleave count has been specified by metadata, it is returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
+ unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
+
-  /// \return The most profitable unroll factor.
-  /// If UserUF is non-zero then this method finds the best unroll-factor
-  /// based on register pressure and other parameters.
-  /// VF and LoopCost are the selected vectorization factor and the cost of the
-  /// selected VF.
+  /// \return The most profitable interleave count.
+  /// This method finds the best interleave count based on register pressure
+  /// and other parameters. VF and LoopCost are the selected vectorization
+  /// factor and the cost of the selected VF.
+ unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
/// \brief A struct that represents some properties of the register usage
/// of a loop.
@@ -1456,9 +1466,14 @@ struct LoopVectorize : public FunctionPass {
const BranchProbability ColdProb(1, 5); // 20%
ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
- // If the target claims to have no vector registers don't attempt
- // vectorization.
- if (!TTI->getNumberOfRegisters(true))
+    // Don't attempt vectorization if:
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
// Build up a worklist of inner-loops to vectorize. This is necessary as
@@ -1633,18 +1648,17 @@ struct LoopVectorize : public FunctionPass {
const LoopVectorizationCostModel::VectorizationFactor VF =
CM.selectVectorizationFactor(OptForSize);
- // Select the unroll factor.
- const unsigned UF =
- CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
+ // Select the interleave count.
+ unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
- if (UF == 1) {
+ if (IC == 1) {
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
"not beneficial to vectorize and user disabled interleaving");
@@ -1654,17 +1668,14 @@ struct LoopVectorize : public FunctionPass {
    // Report the interleaving decision.
emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- Twine("unrolled with interleaving factor " +
- Twine(UF) +
+ Twine("interleaved by " + Twine(IC) +
" (vectorization not beneficial)"));
- // We decided not to vectorize, but we may want to unroll.
-
- InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
+ InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
Unroller.vectorize(&LVL);
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
LB.vectorize(&LVL);
++LoopsVectorized;
@@ -1675,10 +1686,10 @@ struct LoopVectorize : public FunctionPass {
AddRuntimeUnrollDisableMetaData(L);
// Report the vectorization decision.
- emitOptimizationRemark(
- F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
- ", unrolling interleave factor: " + Twine(UF) + ")");
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ Twine("vectorized loop (vectorization width: ") +
+                               Twine(VF.Width) + ", interleave count: " +
+ Twine(IC) + ")");
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -1760,31 +1771,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
return Builder.CreateAdd(Val, Step, "induction");
}
-/// \brief Find the operand of the GEP that should be checked for consecutive
-/// stores. This ignores trailing indices that have no effect on the final
-/// pointer.
-static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
- const DataLayout &DL = Gep->getModule()->getDataLayout();
- unsigned LastOperand = Gep->getNumOperands() - 1;
- unsigned GEPAllocSize = DL.getTypeAllocSize(
- cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
-
- // Walk backwards and try to peel off zeros.
- while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
- // Find the type we're currently indexing into.
- gep_type_iterator GEPTI = gep_type_begin(Gep);
- std::advance(GEPTI, LastOperand - 1);
-
- // If it's a type with the same allocation size as the result of the GEP we
- // can peel off the zero index.
- if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
- break;
- --LastOperand;
- }
-
- return LastOperand;
-}
-
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
// Make sure that the pointer does not point to structs.
@@ -2503,9 +2489,9 @@ void InnerLoopVectorizer::createEmptyLoop() {
*/
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
- BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
- assert(BypassBlock && "Invalid loop structure");
+ assert(VectorPH && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
// Some loops have a single integer induction variable, while other loops
@@ -2545,44 +2531,35 @@ void InnerLoopVectorizer::createEmptyLoop() {
// loop.
Value *BackedgeCount =
Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
- BypassBlock->getTerminator());
+ VectorPH->getTerminator());
if (BackedgeCount->getType()->isPointerTy())
BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
"backedge.ptrcnt.to.int",
- BypassBlock->getTerminator());
+ VectorPH->getTerminator());
Instruction *CheckBCOverflow =
CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
Constant::getAllOnesValue(BackedgeCount->getType()),
- "backedge.overflow", BypassBlock->getTerminator());
+ "backedge.overflow", VectorPH->getTerminator());
// The loop index does not have to start at Zero. Find the original start
// value from the induction PHI node. If we don't have an induction variable
// then we know that it starts at zero.
- Builder.SetInsertPoint(BypassBlock->getTerminator());
- Value *StartIdx = ExtendedIdx = OldInduction ?
- Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
- IdxTy):
- ConstantInt::get(IdxTy, 0);
-
- // We need an instruction to anchor the overflow check on. StartIdx needs to
- // be defined before the overflow check branch. Because the scalar preheader
- // is going to merge the start index and so the overflow branch block needs to
- // contain a definition of the start index.
- Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
- StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
- BypassBlock->getTerminator());
+ Builder.SetInsertPoint(VectorPH->getTerminator());
+ Value *StartIdx = ExtendedIdx =
+ OldInduction
+ ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH),
+ IdxTy)
+ : ConstantInt::get(IdxTy, 0);
// Count holds the overall loop count (N).
Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- BypassBlock->getTerminator());
+ VectorPH->getTerminator());
- LoopBypassBlocks.push_back(BypassBlock);
+ LoopBypassBlocks.push_back(VectorPH);
// Split the single block loop into the two loop structure described above.
- BasicBlock *VectorPH =
- BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
BasicBlock *VecBody =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
@@ -2597,7 +2574,6 @@ void InnerLoopVectorizer::createEmptyLoop() {
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
- ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
} else {
LI->addTopLevelLoop(Lp);
@@ -2615,9 +2591,20 @@ void InnerLoopVectorizer::createEmptyLoop() {
// times the unroll factor (num of SIMD instructions).
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+  // Generate code to check that the loop's trip count, which we compute by
+  // adding one to the backedge-taken count, does not overflow.
+ BasicBlock *NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked");
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ ReplaceInstWithInst(
+ VectorPH->getTerminator(),
+ BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow));
+ VectorPH = NewVectorPH;
+
// This is the IR builder that we use to add all of the logic for bypassing
// the new vector loop.
- IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
+ IRBuilder<> BypassBuilder(VectorPH->getTerminator());
setDebugLocFromInst(BypassBuilder,
getDebugLocFromInstOrOperands(OldInduction));
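A small illustration of why the overflow check above is needed (the numbers
are hypothetical): the trip count is the backedge-taken count plus one, so an
all-ones backedge-taken count wraps the addition to zero.

    #include <cstdint>
    #include <cstdio>

    int main() {
      std::uint32_t BackedgeTakenCount = UINT32_MAX; // taken 2^32 - 1 times
      std::uint32_t TripCount = BackedgeTakenCount + 1; // wraps to 0
      std::printf("%u\n", (unsigned)TripCount); // prints 0 -- the case in
                                                // which CheckBCOverflow sends
                                                // control to the scalar loop
      return 0;
    }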
@@ -2646,24 +2633,14 @@ void InnerLoopVectorizer::createEmptyLoop() {
// jump to the scalar loop.
Value *Cmp =
BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
-
- BasicBlock *LastBypassBlock = BypassBlock;
-
- // Generate code to check that the loops trip count that we computed by adding
- // one to the backedge-taken count will not overflow.
- {
- auto PastOverflowCheck =
- std::next(BasicBlock::iterator(OverflowCheckAnchor));
- BasicBlock *CheckBlock =
- LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
- LoopBypassBlocks.push_back(CheckBlock);
- ReplaceInstWithInst(
- LastBypassBlock->getTerminator(),
- BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow));
- LastBypassBlock = CheckBlock;
- }
+ NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ LoopBypassBlocks.push_back(VectorPH);
+ ReplaceInstWithInst(VectorPH->getTerminator(),
+ BranchInst::Create(MiddleBlock, NewVectorPH, Cmp));
+ VectorPH = NewVectorPH;
// Generate the code to check that the strides we assumed to be one are really
// one. We want the new basic block to start at the first instruction in a
@@ -2671,23 +2648,24 @@ void InnerLoopVectorizer::createEmptyLoop() {
Instruction *StrideCheck;
Instruction *FirstCheckInst;
std::tie(FirstCheckInst, StrideCheck) =
- addStrideCheck(LastBypassBlock->getTerminator());
+ addStrideCheck(VectorPH->getTerminator());
if (StrideCheck) {
AddedSafetyChecks = true;
// Create a new block containing the stride check.
- BasicBlock *CheckBlock =
- LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+ VectorPH->setName("vector.stridecheck");
+ NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
- LoopBypassBlocks.push_back(CheckBlock);
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ LoopBypassBlocks.push_back(VectorPH);
    // Replace the branch at the end of the stride check block with a
    // conditional branch that bypasses the vector loop when a stride is not
    // one.
- ReplaceInstWithInst(LastBypassBlock->getTerminator(),
- BranchInst::Create(MiddleBlock, CheckBlock, Cmp));
+ ReplaceInstWithInst(
+ VectorPH->getTerminator(),
+ BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck));
- Cmp = StrideCheck;
- LastBypassBlock = CheckBlock;
+ VectorPH = NewVectorPH;
}
// Generate the code that checks in runtime if arrays overlap. We put the
@@ -2695,28 +2673,26 @@ void InnerLoopVectorizer::createEmptyLoop() {
// faster.
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
- Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
+ Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator());
if (MemRuntimeCheck) {
AddedSafetyChecks = true;
// Create a new block containing the memory check.
- BasicBlock *CheckBlock =
- LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
+ VectorPH->setName("vector.memcheck");
+ NewVectorPH =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
- LoopBypassBlocks.push_back(CheckBlock);
+ ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
+ LoopBypassBlocks.push_back(VectorPH);
    // Replace the branch at the end of the memory check block with a
    // conditional branch that bypasses the vector loop when the runtime
    // checks detect a possible conflict.
- ReplaceInstWithInst(LastBypassBlock->getTerminator(),
- BranchInst::Create(MiddleBlock, CheckBlock, Cmp));
+ ReplaceInstWithInst(
+ VectorPH->getTerminator(),
+ BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck));
- Cmp = MemRuntimeCheck;
- LastBypassBlock = CheckBlock;
+ VectorPH = NewVectorPH;
}
- ReplaceInstWithInst(LastBypassBlock->getTerminator(),
- BranchInst::Create(MiddleBlock, VectorPH, Cmp));
-
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
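In spirit, the memory check emitted above is a pairwise interval-overlap test
over the pointer ranges accessed in the loop; a minimal sketch follows
(hypothetical helper, not the code generated by addRuntimeCheck):

    #include <cstdint>

    // True if [ABegin, AEnd) and [BBegin, BEnd) may overlap. When the emitted
    // check fires, control flows to MiddleBlock and on to the scalar loop.
    bool mayOverlap(std::uintptr_t ABegin, std::uintptr_t AEnd,
                    std::uintptr_t BBegin, std::uintptr_t BEnd) {
      return ABegin < BEnd && BBegin < AEnd;
    }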
@@ -3831,7 +3807,7 @@ bool LoopVectorizationLegality::canVectorize() {
}
// We can only vectorize innermost loops.
- if (!TheLoop->getSubLoopsVector().empty()) {
+ if (!TheLoop->empty()) {
emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
return false;
}
@@ -3897,10 +3873,11 @@ bool LoopVectorizationLegality::canVectorize() {
// Collect all of the variables that remain uniform after vectorization.
collectLoopUniforms();
- DEBUG(dbgs() << "LV: We can vectorize this loop" <<
- (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
- "")
- <<"!\n");
+ DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
// Analyze interleaved memory accesses.
if (EnableInterleavedMemAccesses)
@@ -4130,118 +4107,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return true;
}
-///\brief Remove GEPs whose indices but the last one are loop invariant and
-/// return the induction operand of the gep pointer.
-static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
- if (!GEP)
- return Ptr;
-
- unsigned InductionOperand = getGEPInductionOperand(GEP);
-
- // Check that all of the gep indices are uniform except for our induction
- // operand.
- for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
- if (i != InductionOperand &&
- !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
- return Ptr;
- return GEP->getOperand(InductionOperand);
-}
-
-///\brief Look for a cast use of the passed value.
-static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
- Value *UniqueCast = nullptr;
- for (User *U : Ptr->users()) {
- CastInst *CI = dyn_cast<CastInst>(U);
- if (CI && CI->getType() == Ty) {
- if (!UniqueCast)
- UniqueCast = CI;
- else
- return nullptr;
- }
- }
- return UniqueCast;
-}
-
-///\brief Get the stride of a pointer access in a loop.
-/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
-/// pointer to the Value, or null otherwise.
-static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
- const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
- if (!PtrTy || PtrTy->isAggregateType())
- return nullptr;
-
- // Try to remove a gep instruction to make the pointer (actually index at this
-  // point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing the
- // pointer, otherwise, we are analyzing the index.
- Value *OrigPtr = Ptr;
-
- // The size of the pointer access.
- int64_t PtrAccessSize = 1;
-
- Ptr = stripGetElementPtr(Ptr, SE, Lp);
- const SCEV *V = SE->getSCEV(Ptr);
-
- if (Ptr != OrigPtr)
- // Strip off casts.
- while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
- V = C->getOperand();
-
- const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
- if (!S)
- return nullptr;
-
- V = S->getStepRecurrence(*SE);
- if (!V)
- return nullptr;
-
- // Strip off the size of access multiplication if we are still analyzing the
- // pointer.
- if (OrigPtr == Ptr) {
- const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
- DL.getTypeAllocSize(PtrTy->getElementType());
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
- if (M->getOperand(0)->getSCEVType() != scConstant)
- return nullptr;
-
- const APInt &APStepVal =
- cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
-
- // Huge step value - give up.
- if (APStepVal.getBitWidth() > 64)
- return nullptr;
-
- int64_t StepVal = APStepVal.getSExtValue();
- if (PtrAccessSize != StepVal)
- return nullptr;
- V = M->getOperand(1);
- }
- }
-
- // Strip off casts.
- Type *StripedOffRecurrenceCast = nullptr;
- if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
- StripedOffRecurrenceCast = C->getType();
- V = C->getOperand();
- }
-
- // Look for the loop invariant symbolic value.
- const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
- if (!U)
- return nullptr;
-
- Value *Stride = U->getValue();
- if (!Lp->isLoopInvariant(Stride))
- return nullptr;
-
- // If we have stripped off the recurrence cast we have to make sure that we
- // return the value that is used in this loop so that we can replace it later.
- if (StripedOffRecurrenceCast)
- Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
-
- return Stride;
-}
-
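The symbolic-stride pattern that the removed getStrideFromPointer helper
recognized looks like this in source form (an illustrative function, not taken
from the patch):

    // 'Stride' is loop-invariant but unknown at compile time. Returning it
    // lets the vectorizer version the loop under the runtime assumption
    // Stride == 1, which makes A[I * Stride] a consecutive access.
    void scale(float *A, long N, long Stride) {
      for (long I = 0; I < N; ++I)
        A[I * Stride] *= 2.0f;
    }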
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
@@ -4585,7 +4450,7 @@ LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
// Width 1 means no vectorize
VectorizationFactor Factor = { 1U, 0U };
- if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+ if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
emitAnalysis(VectorizationReport() <<
"runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
@@ -4745,41 +4610,40 @@ unsigned LoopVectorizationCostModel::getWidestType() {
return MaxWidth;
}
-unsigned
-LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
- unsigned VF,
- unsigned LoopCost) {
+unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
+ unsigned VF,
+ unsigned LoopCost) {
- // -- The unroll heuristics --
- // We unroll the loop in order to expose ILP and reduce the loop overhead.
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
// at this level. For example, frontend pressure (on decode or fetch) due to
// code size, or the number and capabilities of the execution ports.
//
- // We use the following heuristics to select the unroll factor:
- // 1. If the code has reductions, then we unroll in order to break the cross
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
// iteration dependency.
- // 2. If the loop is really small, then we unroll in order to reduce the loop
+ // 2. If the loop is really small, then we interleave to reduce the loop
// overhead.
- // 3. We don't unroll if we think that we will spill registers to memory due
- // to the increased register pressure.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
// Use the user preference, unless 'auto' is selected.
int UserUF = Hints->getInterleave();
if (UserUF != 0)
return UserUF;
- // When we optimize for size, we don't unroll.
+ // When we optimize for size, we don't interleave.
if (OptForSize)
return 1;
- // We used the distance for the unroll factor.
+  // We already used the maximum safe dependence distance to pick the VF, so
+  // interleaving further could violate those dependences.
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
- // Do not unroll loops with a relatively small trip count.
+ // Do not interleave loops with a relatively small trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
- if (TC > 1 && TC < TinyTripCountUnrollThreshold)
+ if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
@@ -4800,32 +4664,32 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
R.NumInstructions = std::max(R.NumInstructions, 1U);
- // We calculate the unroll factor using the following formula.
+ // We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
- // registers. These registers are used by all of the unrolled instances.
+ // registers. These registers are used by all of the interleaved instances.
// Next, divide the remaining registers by the number of registers that is
// required by the loop, in order to estimate how many parallel instances
// fit without causing spills. All of this is rounded down if necessary to be
- // a power of two. We want power of two unroll factors to simplify any
+  // a power of two. We want power-of-two interleave counts to simplify any
// addressing operations or alignment considerations.
- unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+ unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
R.MaxLocalUsers);
- // Don't count the induction variable as unrolled.
+ // Don't count the induction variable as interleaved.
if (EnableIndVarRegisterHeur)
- UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+ IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
std::max(1U, (R.MaxLocalUsers - 1)));
- // Clamp the unroll factor ranges to reasonable factors.
- unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);
+  // Clamp the interleave count to a reasonable range.
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
- // Check if the user has overridden the unroll max.
+ // Check if the user has overridden the max.
if (VF == 1) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
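To make the register-pressure formula above concrete, here is a worked example
with hypothetical numbers (powerOf2Floor mirrors llvm::PowerOf2Floor):

    #include <cstdio>

    unsigned powerOf2Floor(unsigned X) { // round down to a power of two
      unsigned P = 1;
      while (P * 2 <= X)
        P *= 2;
      return P;
    }

    int main() {
      unsigned TargetNumRegisters = 16; // hypothetical target
      unsigned LoopInvariantRegs = 2;   // shared by all interleaved copies
      unsigned MaxLocalUsers = 3;       // per-iteration register demand
      unsigned IC = powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                  MaxLocalUsers);
      std::printf("IC = %u\n", IC); // (16 - 2) / 3 = 4, already a power of two
      return 0;
    }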
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -4833,72 +4697,74 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
if (LoopCost == 0)
LoopCost = expectedCost(VF);
- // Clamp the calculated UF to be between the 1 and the max unroll factor
+  // Clamp the calculated IC to be between 1 and the max interleave count
// that the target allows.
- if (UF > MaxInterleaveSize)
- UF = MaxInterleaveSize;
- else if (UF < 1)
- UF = 1;
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
+ else if (IC < 1)
+ IC = 1;
- // Unroll if we vectorized this loop and there is a reduction that could
- // benefit from unrolling.
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
if (VF > 1 && Legal->getReductionVars()->size()) {
- DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
- return UF;
+ DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
}
// Note that if we've already vectorized the loop we will have done the
- // runtime check and so unrolling won't require further checks.
- bool UnrollingRequiresRuntimePointerCheck =
- (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
+ (VF == 1 && Legal->getRuntimePointerChecking()->Need);
- // We want to unroll small loops in order to reduce the loop overhead and
+ // We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
- if (!UnrollingRequiresRuntimePointerCheck &&
- LoopCost < SmallLoopCost) {
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
// We assume that the cost overhead is 1 and we use the cost model
- // to estimate the cost of the loop and unroll until the cost of the
+ // to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
- unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
- // Unroll until store/load ports (estimated by max unroll factor) are
+ // Interleave until store/load ports (estimated by max interleave count) are
// saturated.
unsigned NumStores = Legal->getNumStores();
unsigned NumLoads = Legal->getNumLoads();
- unsigned StoresUF = UF / (NumStores ? NumStores : 1);
- unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1);
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
// If we have a scalar reduction (vector reductions are already dealt with
// by this point), we can increase the critical path length if the loop
- // we're unrolling is inside another loop. Limit, by default to 2, so the
+    // we're interleaving is inside another loop. Limit, by default, to 2 so the
// critical path only gets increased by one reduction operation.
if (Legal->getReductionVars()->size() &&
TheLoop->getLoopDepth() > 1) {
- unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
- SmallUF = std::min(SmallUF, F);
- StoresUF = std::min(StoresUF, F);
- LoadsUF = std::min(LoadsUF, F);
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
}
- if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
- DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
- return std::max(StoresUF, LoadsUF);
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
}
- DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
- return SmallUF;
+ DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
}
- // Unroll if this is a large loop (small loops are already dealt with by this
- // point) that could benefit from interleaved unrolling.
+  // Interleave if this is a large loop (small loops are already dealt with by
+  // this point) that could benefit from interleaving.
bool HasReductions = (Legal->getReductionVars()->size() > 0);
if (TTI.enableAggressiveInterleaving(HasReductions)) {
- DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
- return UF;
+ DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
}
- DEBUG(dbgs() << "LV: Not Unrolling.\n");
+ DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
}
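An illustration of the port-saturation path above, with hypothetical counts:
if IC is 8 and the loop has two stores and one load, StoresIC = 8/2 = 4 and
LoadsIC = 8/1 = 8; when SmallIC is smaller, the larger of the two wins.

    #include <algorithm>
    #include <cstdio>

    int main() {
      unsigned IC = 8, NumStores = 2, NumLoads = 1, SmallIC = 2; // hypothetical
      unsigned StoresIC = IC / NumStores; // 4
      unsigned LoadsIC = IC / NumLoads;   // 8
      if (std::max(StoresIC, LoadsIC) > SmallIC) // saturation path taken
        std::printf("interleave %u times\n", std::max(StoresIC, LoadsIC));
      return 0;
    }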
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7c4c279..7bac407 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
namespace {
+// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
@@ -2136,9 +2141,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
// Prepare the operand vector.
- for (unsigned j = 0; j < E->Scalars.size(); ++j)
- Operands.push_back(cast<PHINode>(E->Scalars[j])->
- getIncomingValueForBlock(IBB));
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
@@ -2172,8 +2176,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
@@ -2191,9 +2195,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2217,10 +2221,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2259,9 +2263,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
@@ -2322,8 +2326,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
@@ -2351,8 +2355,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
ValueList Op0VL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
Value *Op0 = vectorizeTree(Op0VL);
@@ -2360,8 +2364,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList OpVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
@@ -2397,8 +2401,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
@@ -3089,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
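A usage note: with this change the register size can be forced from the
command line, e.g. (hypothetical invocation, assuming the legacy
-slp-vectorizer pass flag)

    opt -slp-vectorizer -slp-max-reg-size=256 -S in.ll -o out.ll

though, as the TODO above observes, a wider register does not by itself imply
the target supports integer operations at that width.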
@@ -3166,12 +3181,13 @@ private:
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R);
+ BoUpSLP &R, unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
BoUpSLP &R);
private:
StoreListMap StoreRefs;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
};
/// \brief Check that the Values in the slice in VL array are still existent in
@@ -3186,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R) {
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned VF = MinVecRegSize / Sz;
+ unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
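With the register size now a parameter, the vectorization factor scales with
it; illustrative numbers follow (not from the patch), matching the halving
retry loop in vectorizeStores() below:

    #include <cstdio>

    int main() {
      unsigned Sz = 32; // store elements of 32 bits
      for (unsigned VecRegSize = 256; VecRegSize >= 128; VecRegSize /= 2)
        std::printf("VecRegSize=%u -> VF=%u\n", VecRegSize, VecRegSize / Sz);
      // 256-bit registers give VF = 8, 128-bit give VF = 4; the wider chain
      // is tried first and halved on failure.
      return 0;
    }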
@@ -3277,12 +3294,16 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
I = ConsecutiveChain[I];
}
- bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
-
- // Mark the vectorized stores so that we don't vectorize them again.
- if (Vectorized)
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed |= Vectorized;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
}
return Changed;
@@ -3293,8 +3314,8 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
const DataLayout &DL = BB->getModule()->getDataLayout();
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- StoreInst *SI = dyn_cast<StoreInst>(it);
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
@@ -3342,13 +3363,15 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Type *Ty0 = I0->getType();
unsigned Sz = DL.getTypeSizeInBits(Ty0);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
unsigned VF = MinVecRegSize / Sz;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- Type *Ty = VL[i]->getType();
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
if (!isValidElementType(Ty))
return false;
- Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+ Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
@@ -3571,6 +3594,8 @@ public:
const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
@@ -3997,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),