path: root/contrib/llvm/lib/Transforms/Vectorize
author    dim <dim@FreeBSD.org>    2015-12-30 13:13:10 +0000
committer dim <dim@FreeBSD.org>    2015-12-30 13:13:10 +0000
commit    9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a (patch)
tree      b466a4817f79516eb1df8eae92bccf62ecc84003 /contrib/llvm/lib/Transforms/Vectorize
parent    f09a28d1de99fda4f5517fb12670fc36552f4927 (diff)
parent    e194cd6d03d91631334d9d5e55b506036f423cc8 (diff)
download  FreeBSD-src-9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a.zip
          FreeBSD-src-9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a.tar.gz
Update llvm to trunk r256633.
Diffstat (limited to 'contrib/llvm/lib/Transforms/Vectorize')
-rw-r--r--   contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp     175
-rw-r--r--   contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   2475
-rw-r--r--   contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    489
3 files changed, 1871 insertions, 1268 deletions
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index 215d6f9..8844d57 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -25,8 +25,11 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
@@ -204,9 +207,10 @@ namespace {
BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
: BasicBlockPass(ID), Config(C) {
- AA = &P->getAnalysis<AliasAnalysis>();
+ AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &P->getAnalysis<ScalarEvolution>();
+ SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = IgnoreTargetInfo
? nullptr
: &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
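The replacements above follow LLVM's migration away from analysis-group objects: a legacy pass no longer names AliasAnalysis or ScalarEvolution directly, but fetches the result object out of a wrapper pass. A minimal sketch of the pattern against the LLVM 3.8-era API; ExamplePass is illustrative and not part of this commit:

    // Sketch: fetching analysis results through wrapper passes under the
    // legacy pass manager (LLVM 3.8-era API). ExamplePass is hypothetical.
    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Pass.h"

    using namespace llvm;

    namespace {
    struct ExamplePass : public FunctionPass {
      static char ID;
      ExamplePass() : FunctionPass(ID) {}

      bool runOnFunction(Function &F) override {
        // Old style was getAnalysis<AliasAnalysis>(); the result object now
        // lives behind its wrapper pass.
        AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
        ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
        const TargetLibraryInfo &TLI =
            getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
        (void)AA; (void)SE; (void)TLI;
        return false; // No IR changes in this sketch.
      }

      void getAnalysisUsage(AnalysisUsage &AU) const override {
        // Every wrapper fetched in runOnFunction must be declared required.
        AU.addRequired<AAResultsWrapperPass>();
        AU.addRequired<ScalarEvolutionWrapperPass>();
        AU.addRequired<TargetLibraryInfoWrapperPass>();
        AU.setPreservesAll();
      }
    };
    } // end anonymous namespace

    char ExamplePass::ID = 0;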
@@ -221,6 +225,7 @@ namespace {
AliasAnalysis *AA;
DominatorTree *DT;
ScalarEvolution *SE;
+ const TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
// FIXME: const correct?
@@ -437,9 +442,10 @@ namespace {
bool runOnBasicBlock(BasicBlock &BB) override {
// OptimizeNone check deferred to vectorizeBB().
- AA = &getAnalysis<AliasAnalysis>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = IgnoreTargetInfo
? nullptr
: &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
@@ -450,13 +456,15 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
BasicBlockPass::getAnalysisUsage(AU);
- AU.addRequired<AliasAnalysis>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolution>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<AliasAnalysis>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
AU.setPreservesCFG();
}
@@ -842,7 +850,7 @@ namespace {
// It is important to cleanup here so that future iterations of this
// function have less work to do.
- (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo());
+ (void)SimplifyInstructionsInBlock(&BB, TLI);
return true;
}
@@ -1239,20 +1247,23 @@ namespace {
if (I == Start) IAfterStart = true;
bool IsSimpleLoadStore;
- if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
+ if (!isInstVectorizable(&*I, IsSimpleLoadStore))
+ continue;
// Look for an instruction with which to pair instruction *I...
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
+ if (I->mayWriteToMemory())
+ WriteSet.add(&*I);
bool JAfterStart = IAfterStart;
BasicBlock::iterator J = std::next(I);
for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
- if (J == Start) JAfterStart = true;
+ if (&*J == Start)
+ JAfterStart = true;
// Determine if J uses I, if so, exit the loop.
- bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
+ bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
if (Config.FastDep) {
// Note: For this heuristic to be effective, independent operations
// must tend to be intermixed. This is likely to be true from some
@@ -1269,25 +1280,26 @@ namespace {
// J does not use I, and comes before the first use of I, so it can be
// merged with I if the instructions are compatible.
int CostSavings, FixedOrder;
- if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
- CostSavings, FixedOrder)) continue;
+ if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
+ CostSavings, FixedOrder))
+ continue;
// J is a candidate for merging with I.
if (PairableInsts.empty() ||
- PairableInsts[PairableInsts.size()-1] != I) {
- PairableInsts.push_back(I);
+ PairableInsts[PairableInsts.size() - 1] != &*I) {
+ PairableInsts.push_back(&*I);
}
- CandidatePairs[I].push_back(J);
+ CandidatePairs[&*I].push_back(&*J);
++TotalPairs;
if (TTI)
- CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
- CostSavings));
+ CandidatePairCostSavings.insert(
+ ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
if (FixedOrder == 1)
- FixedOrderPairs.insert(ValuePair(I, J));
+ FixedOrderPairs.insert(ValuePair(&*I, &*J));
else if (FixedOrder == -1)
- FixedOrderPairs.insert(ValuePair(J, I));
+ FixedOrderPairs.insert(ValuePair(&*J, &*I));
// The next call to this function must start after the last instruction
// selected during this invocation.
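Most of the churn in this hunk is one mechanical change: after the ilist iterator rework in this LLVM snapshot, BasicBlock::iterator no longer converts implicitly to Instruction*, so each call site now spells the conversion as &*I. A minimal sketch of the idiom; visitInstructions is illustrative:

    // Sketch: the &*I idiom adopted throughout this hunk. The iterator is
    // dereferenced and re-addressed to obtain the Instruction pointer.
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    static void visitInstructions(BasicBlock &BB) {
      for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
        Instruction *Inst = &*I; // 'Instruction *Inst = I;' no longer compiles.
        (void)Inst;
      }
    }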
@@ -1468,14 +1480,16 @@ namespace {
BasicBlock::iterator E = BB.end(), EL =
BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
- if (IsInPair.find(I) == IsInPair.end()) continue;
+ if (IsInPair.find(&*I) == IsInPair.end())
+ continue;
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
+ if (I->mayWriteToMemory())
+ WriteSet.add(&*I);
for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
- (void) trackUsesOfI(Users, WriteSet, I, J);
+ (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
if (J == EL)
break;
@@ -1484,7 +1498,7 @@ namespace {
for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
U != E; ++U) {
if (IsInPair.find(*U) == IsInPair.end()) continue;
- PairableInstUsers.insert(ValuePair(I, *U));
+ PairableInstUsers.insert(ValuePair(&*I, *U));
}
if (I == EL)
@@ -2806,55 +2820,51 @@ namespace {
Instruction *J, Instruction *K,
Instruction *&InsertionPt,
Instruction *&K1, Instruction *&K2) {
- if (isa<StoreInst>(I)) {
- AA->replaceWithNewValue(I, K);
- AA->replaceWithNewValue(J, K);
- } else {
- Type *IType = I->getType();
- Type *JType = J->getType();
+ if (isa<StoreInst>(I))
+ return;
- VectorType *VType = getVecTypeForPair(IType, JType);
- unsigned numElem = VType->getNumElements();
+ Type *IType = I->getType();
+ Type *JType = J->getType();
- unsigned numElemI = getNumScalarElements(IType);
- unsigned numElemJ = getNumScalarElements(JType);
+ VectorType *VType = getVecTypeForPair(IType, JType);
+ unsigned numElem = VType->getNumElements();
- if (IType->isVectorTy()) {
- std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
- for (unsigned v = 0; v < numElemI; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
- }
+ unsigned numElemI = getNumScalarElements(IType);
+ unsigned numElemJ = getNumScalarElements(JType);
- K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get( Mask1),
- getReplacementName(K, false, 1));
- } else {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- K1 = ExtractElementInst::Create(K, CV0,
- getReplacementName(K, false, 1));
+ if (IType->isVectorTy()) {
+ std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
+ for (unsigned v = 0; v < numElemI; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
}
- if (JType->isVectorTy()) {
- std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
- for (unsigned v = 0; v < numElemJ; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
- }
+ K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(Mask1),
+ getReplacementName(K, false, 1));
+ } else {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
+ }
- K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get( Mask2),
- getReplacementName(K, false, 2));
- } else {
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
- K2 = ExtractElementInst::Create(K, CV1,
- getReplacementName(K, false, 2));
+ if (JType->isVectorTy()) {
+ std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
+ for (unsigned v = 0; v < numElemJ; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
}
- K1->insertAfter(K);
- K2->insertAfter(K1);
- InsertionPt = K2;
+ K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(Mask2),
+ getReplacementName(K, false, 2));
+ } else {
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
+ K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
}
+
+ K1->insertAfter(K);
+ K2->insertAfter(K1);
+ InsertionPt = K2;
}
// Move all uses of the function I (including pairing-induced uses) after J.
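The restructured function above splits the fused value K back into the two runs of lanes that replace I and J; each mask is a plain run of lane indices fed to a shufflevector against undef (the pre-LLVM-11 constant-vector mask form). A hedged sketch of that construction; extractLanes is illustrative:

    // Sketch: pull NumLanes lanes starting at First out of vector K via
    // shufflevector, as the K1/K2 construction above does for each half.
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Type.h"
    #include <vector>

    using namespace llvm;

    static Instruction *extractLanes(Value *K, unsigned First,
                                     unsigned NumLanes) {
      LLVMContext &Ctx = K->getContext();
      std::vector<Constant *> Mask(NumLanes);
      for (unsigned v = 0; v < NumLanes; ++v)
        Mask[v] = ConstantInt::get(Type::getInt32Ty(Ctx), First + v);
      // The second shuffle operand is never selected, so undef suffices.
      return new ShuffleVectorInst(K, UndefValue::get(K->getType()),
                                   ConstantVector::get(Mask), "lanes");
    }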
@@ -2869,7 +2879,7 @@ namespace {
if (I->mayWriteToMemory()) WriteSet.add(I);
for (; cast<Instruction>(L) != J; ++L)
- (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
+ (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
assert(cast<Instruction>(L) == J &&
"Tracking has not proceeded far enough to check for dependencies");
@@ -2891,9 +2901,9 @@ namespace {
if (I->mayWriteToMemory()) WriteSet.add(I);
for (; cast<Instruction>(L) != J;) {
- if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
+ if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
// Move this instruction
- Instruction *InstToMove = L; ++L;
+ Instruction *InstToMove = &*L++;
DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
" to after " << *InsertionPt << "\n");
@@ -2924,11 +2934,11 @@ namespace {
// Note: We cannot end the loop when we reach J because J could be moved
// farther down the use chain by another instruction pairing. Also, J
// could be before I if this is an inverted input.
- for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
- if (trackUsesOfI(Users, WriteSet, I, L)) {
+ for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
+ if (trackUsesOfI(Users, WriteSet, I, &*L)) {
if (L->mayReadFromMemory()) {
- LoadMoveSet[L].push_back(I);
- LoadMoveSetPairs.insert(ValuePair(L, I));
+ LoadMoveSet[&*L].push_back(I);
+ LoadMoveSetPairs.insert(ValuePair(&*L, I));
}
}
}
@@ -2991,7 +3001,7 @@ namespace {
DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+ DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
if (P == ChosenPairs.end()) {
++PI;
continue;
@@ -3116,12 +3126,9 @@ namespace {
} else if (!isa<StoreInst>(K))
K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_fpmath
- };
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_group};
combineMetadata(K, H, KnownIDs);
K->intersectOptionalDataWith(H);
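combineMetadata keeps only whitelisted metadata kinds on the fused instruction K, intersecting K's and H's operands kind by kind; kinds outside the list are dropped. This hunk widens the whitelist with MD_invariant_group. A small sketch of the call; mergePairMetadata is illustrative:

    // Sketch: intersect the metadata of two fused instructions. Kinds not
    // listed in KnownIDs are removed from K entirely.
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    static void mergePairMetadata(Instruction *K, Instruction *H) {
      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                             LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
                             LLVMContext::MD_invariant_group};
      combineMetadata(K, H, KnownIDs);
    }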
@@ -3145,8 +3152,6 @@ namespace {
if (!isa<StoreInst>(I)) {
L->replaceAllUsesWith(K1);
H->replaceAllUsesWith(K2);
- AA->replaceWithNewValue(L, K1);
- AA->replaceWithNewValue(H, K2);
}
// Instructions that may read from memory may be in the load move set.
@@ -3197,10 +3202,14 @@ namespace {
char BBVectorize::ID = 0;
static const char bb_vectorize_name[] = "Basic-Block Vectorization";
INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
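The INITIALIZE_PASS_* block above re-registers BBVectorize with its new wrapper-pass dependencies so the legacy pass manager can schedule them first. Out-of-tree passes more often use the self-contained RegisterPass form; a minimal sketch, with ExampleBBPass and its strings being illustrative:

    // Sketch: the out-of-tree counterpart of the registration macros above.
    #include "llvm/IR/Function.h"
    #include "llvm/Pass.h"

    using namespace llvm;

    namespace {
    struct ExampleBBPass : public FunctionPass {
      static char ID;
      ExampleBBPass() : FunctionPass(ID) {}
      bool runOnFunction(Function &) override { return false; }
    };
    } // end anonymous namespace

    char ExampleBBPass::ID = 0;
    static RegisterPass<ExampleBBPass>
        X("example-bb", "Example pass registration", false, false);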
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 69ca268..a627dd6 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -48,7 +48,6 @@
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
@@ -58,10 +57,13 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
@@ -99,6 +101,7 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
+#include <functional>
#include <map>
#include <tuple>
@@ -123,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
"trip count that is smaller than this "
"value."));
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
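The new -vectorizer-maximize-bandwidth switch above, like the SCEV and memcheck thresholds added further down, uses LLVM's cl::opt machinery. A minimal sketch of the same pattern; the flag name is made up for illustration:

    // Sketch: a hidden, defaulted command-line flag in the style of the
    // options added by this commit ('-my-vectorize-threshold' is hypothetical).
    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<unsigned> MyVectorizeThreshold(
        "my-vectorize-threshold", llvm::cl::init(16), llvm::cl::Hidden,
        llvm::cl::desc("Illustrative threshold; cl::Hidden keeps it out of "
                       "-help and cl::init(16) supplies the default."));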
/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
/// for (i = 0; i < N; ++i)
@@ -136,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
/// ...
static cl::opt<bool> EnableMemAccessVersioning(
"enable-mem-access-versioning", cl::init(true), cl::Hidden,
- cl::desc("Enable symblic stride memory access versioning"));
+ cl::desc("Enable symbolic stride memory access versioning"));
static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
@@ -214,12 +222,27 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
namespace {
// Forward declarations.
+class LoopVectorizeHints;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
-class LoopVectorizeHints;
+class LoopVectorizationRequirements;
/// \brief This modifies LoopAccessReport to initialize message with
/// loop-vectorizer-specific part.
@@ -245,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
+/// A helper function that returns the GEP instruction and knows to skip a
+/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
+/// pointee types of the 'bitcast' have the same size.
+/// For example:
+///   bitcast double** %var to i64* - can be skipped
+///   bitcast double** %var to i8*  - cannot
+static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
+
+ if (isa<GetElementPtrInst>(Ptr))
+ return cast<GetElementPtrInst>(Ptr);
+
+ if (isa<BitCastInst>(Ptr) &&
+ isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
+ Type *BitcastTy = Ptr->getType();
+ Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
+ if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
+ return nullptr;
+ Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
+ Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
+ const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
+ if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
+ return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
+ }
+ return nullptr;
+}
+
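The helper above only looks through a bitcast when the pointee sizes agree, queried via the module's DataLayout. A hedged sketch of just that size test; bitcastIsTransparent is illustrative:

    // Sketch: on a typical 64-bit target, a bitcast of double** to i64*
    // compares two 64-bit pointees and passes; double** to i8* compares
    // 64 bits to 8 bits and fails.
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static bool bitcastIsTransparent(const BitCastInst *BC,
                                     const DataLayout &DL) {
      auto *DstTy = dyn_cast<PointerType>(BC->getType());
      auto *SrcTy = dyn_cast<PointerType>(BC->getSrcTy());
      if (!DstTy || !SrcTy)
        return false;
      return DL.getTypeSizeInBits(DstTy->getPointerElementType()) ==
             DL.getTypeSizeInBits(SrcTy->getPointerElementType());
    }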
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -261,25 +310,30 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
- InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, const TargetLibraryInfo *TLI,
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, unsigned VecWidth,
unsigned UnrollFactor)
- : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
- Legal(nullptr), AddedSafetyChecks(false) {}
+ TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
+ AddedSafetyChecks(false) {}
// Perform the actual loop widening (vectorization).
- void vectorize(LoopVectorizationLegality *L) {
+ // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
+ // can be validly truncated to. The cost model has assumed this truncation
+ // will happen when vectorizing.
+ void vectorize(LoopVectorizationLegality *L,
+ MapVector<Instruction*,uint64_t> MinimumBitWidths) {
+ MinBWs = MinimumBitWidths;
Legal = L;
// Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop();
// Widen each instruction in the old loop to a new one in the new loop.
// Use the Legality module to find the induction and reduction variables.
vectorizeLoop();
- // Register the new loop and update the analysis passes.
- updateAnalysis();
}
// Return true if any runtime check is added.
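The MinimumBitWidths map threaded into vectorize() is produced by the new DemandedBits-based analysis (note the DemandedBits.h include added above). A hedged sketch of how such a map is obtained, assuming the computeMinimumValueSizes helper this snapshot ships in VectorUtils:

    // Sketch: instructions absent from the returned map must stay at their
    // declared width; present ones may be truncated to the recorded width.
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/MapVector.h"
    #include "llvm/Analysis/DemandedBits.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/Analysis/VectorUtils.h"

    using namespace llvm;

    static MapVector<Instruction *, uint64_t>
    computeLoopMinBWs(ArrayRef<BasicBlock *> LoopBlocks, DemandedBits &DB,
                      const TargetTransformInfo *TTI) {
      return computeMinimumValueSizes(LoopBlocks, DB, TTI);
    }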
@@ -302,14 +356,11 @@ protected:
typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
VectorParts> EdgeMaskCache;
- /// \brief Add checks for strides that were assumed to be 1.
- ///
- /// Returns the last check instruction and the first check instruction in the
- /// pair as (first, last).
- std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
-
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop();
+ /// Create a new induction variable inside L.
+ PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+ Value *Step, Instruction *DL);
/// Copy and widen the instructions from the old loop.
virtual void vectorizeLoop();
@@ -319,6 +370,9 @@ protected:
/// See PR14725.
void fixLCSSAPHIs();
+ /// Shrinks vector element sizes based on information in "MinBWs".
+ void truncateToMinimalBitwidths();
+
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
@@ -329,7 +383,7 @@ protected:
/// A helper function to vectorize a single BB within the innermost loop.
void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
-
+
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -374,6 +428,23 @@ protected:
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
+ /// Returns (and creates if needed) the original loop trip count.
+ Value *getOrCreateTripCount(Loop *NewLoop);
+
+ /// Returns (and creates if needed) the trip count of the widened loop.
+ Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+ /// Emit a bypass check to see if the trip count would overflow, or we
+ /// wouldn't have enough iterations to execute one vector loop.
+ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+ /// Emit a bypass check to see if the vector trip count is nonzero.
+ void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+ /// Emit a bypass check to see if all of the SCEV assumptions we've
+ /// had to make are correct.
+ void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+ /// Emit bypass checks to check any memory assumptions we may have made.
+ void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
/// This is a helper class that holds the vectorizer state. It maps scalar
/// instructions to vector instructions. When the code is 'unrolled' then
/// then a single scalar value is mapped to multiple vector parts. The parts
@@ -416,8 +487,10 @@ protected:
/// The original loop.
Loop *OrigLoop;
- /// Scev analysis to use.
- ScalarEvolution *SE;
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+ /// dynamic knowledge to simplify SCEV expressions and converts them to a
+ /// more usable form.
+ PredicatedScalarEvolution &PSE;
/// Loop Info.
LoopInfo *LI;
/// Dominator Tree.
@@ -462,12 +535,21 @@ protected:
PHINode *Induction;
/// The induction variable of the old basic block.
PHINode *OldInduction;
- /// Holds the extended (to the widest induction type) start index.
- Value *ExtendedIdx;
/// Maps scalars to widened vectors.
ValueMap WidenMap;
+ /// Store instructions that should be predicated, as a pair
+ /// <StoreInst, Predicate>
+ SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
EdgeMaskCache MaskCache;
-
+ /// Trip count of the original loop.
+ Value *TripCount;
+ /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+ Value *VectorTripCount;
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction*,uint64_t> MinBWs;
LoopVectorizationLegality *Legal;
// Record whether runtime check is added.
@@ -476,10 +558,11 @@ protected:
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
- InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, const TargetLibraryInfo *TLI,
+ InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, unsigned UnrollFactor)
- : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
private:
void scalarizeInstruction(Instruction *Instr,
@@ -551,7 +634,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
if (Kind != LLVMContext::MD_tbaa &&
Kind != LLVMContext::MD_alias_scope &&
Kind != LLVMContext::MD_noalias &&
- Kind != LLVMContext::MD_fpmath)
+ Kind != LLVMContext::MD_fpmath &&
+ Kind != LLVMContext::MD_nontemporal)
continue;
To->setMetadata(Kind, M.second);
@@ -559,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
}
/// \brief Propagate known metadata from one instruction to a vector of others.
-static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
+static void propagateMetadata(SmallVectorImpl<Value *> &To,
+ const Instruction *From) {
for (Value *V : To)
if (Instruction *I = dyn_cast<Instruction>(V))
propagateMetadata(I, From);
@@ -699,8 +784,9 @@ private:
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
- InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT)
- : SE(SE), TheLoop(L), DT(DT) {}
+ InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
+ DominatorTree *DT)
+ : PSE(PSE), TheLoop(L), DT(DT) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
@@ -730,7 +816,11 @@ public:
}
private:
- ScalarEvolution *SE;
+ /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
+ /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
+ /// The interleaved access analysis can also add new predicates (for example
+ /// by versioning strides of pointers).
+ PredicatedScalarEvolution &PSE;
Loop *TheLoop;
DominatorTree *DT;
@@ -778,6 +868,304 @@ private:
const ValueToValueMap &Strides);
};
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them unless the user has specifically asked for them.
+class LoopVectorizeHints {
+ enum HintKind {
+ HK_WIDTH,
+ HK_UNROLL,
+ HK_FORCE
+ };
+
+ /// Hint - associates name and validation with the hint value.
+ struct Hint {
+ const char * Name;
+ unsigned Value; // This may have to change for non-numeric values.
+ HintKind Kind;
+
+ Hint(const char * Name, unsigned Value, HintKind Kind)
+ : Name(Name), Value(Value), Kind(Kind) { }
+
+ bool validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ }
+ return false;
+ }
+ };
+
+ /// Vectorization width.
+ Hint Width;
+ /// Vectorization interleave factor.
+ Hint Interleave;
+ /// Vectorization forced
+ Hint Force;
+
+ /// Return the loop metadata prefix.
+ static StringRef Prefix() { return "llvm.loop."; }
+
+public:
+ enum ForceKind {
+ FK_Undefined = -1, ///< Not selected.
+ FK_Disabled = 0, ///< Forcing disabled.
+ FK_Enabled = 1, ///< Forcing enabled.
+ };
+
+ LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor,
+ HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ TheLoop(L) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+ }
+
+ /// Mark the loop L as already vectorized by setting the width to 1.
+ void setAlreadyVectorized() {
+ Width.Value = Interleave.Value = 1;
+ Hint Hints[] = {Width, Interleave};
+ writeHintsToMetadata(Hints);
+ }
+
+ bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(),
+ vectorizeAnalysisPassName(), *F,
+ L->getStartLoc(), emitRemark());
+ return false;
+ }
+
+ if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+ DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(),
+ vectorizeAnalysisPassName(), *F,
+ L->getStartLoc(), emitRemark());
+ return false;
+ }
+
+ if (getWidth() == 1 && getInterleave() == 1) {
+ // FIXME: Add a separate metadata to indicate when the loop has already
+ // been vectorized instead of setting width and count to 1.
+ DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
+ "loop not vectorized: vectorization and interleaving are explicitly "
+ "disabled, or vectorize width and interleave count are both set to "
+ "1");
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Dumps all the hint information.
+ std::string emitRemark() const {
+ VectorizationReport R;
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ R << "vectorization is explicitly disabled";
+ else {
+ R << "use -Rpass-analysis=loop-vectorize for more info";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=true";
+ if (Width.Value != 0)
+ R << ", Vector Width=" << Width.Value;
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << Interleave.Value;
+ R << ")";
+ }
+ }
+
+ return R.str();
+ }
+
+ unsigned getWidth() const { return Width.Value; }
+ unsigned getInterleave() const { return Interleave.Value; }
+ enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+ const char *vectorizeAnalysisPassName() const {
+ // If hints are provided that don't disable vectorization use the
+ // AlwaysPrint pass name to force the frontend to print the diagnostic.
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return DiagnosticInfo::AlwaysPrint;
+ }
+
+  bool allowReordering() const {
+    // When loop hints that enable vectorization are provided, we allow the
+    // vectorizer to change the order of operations given by the scalar loop.
+    // This is not enabled by default because it can be unsafe or inefficient.
+    // For example, reordering floating-point operations will change the way
+    // round-off error accumulates in the loop.
+    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+  }
+
+private:
+ /// Find hints specified in the loop metadata and update local values.
+ void getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+ // The expected hint is either a MDString or a MDNode with the first
+ // operand a MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+ }
+
+ /// Checks string hint with one operand and set value if valid.
+ void setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C) return;
+ unsigned Val = C->getZExtValue();
+
+ Hint *Hints[] = {&Width, &Interleave, &Force};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+ }
+
+ /// Create a new hint from name / value pair.
+ MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {MDString::get(Context, Name),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+ }
+
+ /// Matches metadata with hint name.
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+ MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
+
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+ }
+
+ /// Sets current hints into loop metadata, keeping other values intact.
+ void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.size() == 0)
+ return;
+
+ // Reserve the first element to LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If node in update list, ignore old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+ }
+
+ /// The loop these hints belong to.
+ const Loop *TheLoop;
+};
+
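LoopVectorizeHints round-trips hints such as vectorize.width through the loop's self-referential !llvm.loop metadata. A sketch of the node shape that createHintMetadata() above produces; buildWidthHint is illustrative:

    // Sketch: one hint node is an MDNode of (MDString name, i32 value),
    // later attached as an operand of the self-referential loop ID node.
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    static MDNode *buildWidthHint(LLVMContext &Ctx, unsigned Width) {
      Metadata *Ops[] = {
          MDString::get(Ctx, "llvm.loop.vectorize.width"),
          ConstantAsMetadata::get(
              ConstantInt::get(Type::getInt32Ty(Ctx), Width))};
      return MDNode::get(Ctx, Ops);
    }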
+static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
+ const LoopVectorizeHints &Hints,
+ const LoopAccessReport &Message) {
+ const char *Name = Hints.vectorizeAnalysisPassName();
+ LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
+}
+
+static void emitMissedWarning(Function *F, Loop *L,
+ const LoopVectorizeHints &LH) {
+ emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ LH.emitRemark());
+
+ if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
+ if (LH.getWidth() != 1)
+ emitLoopVectorizeWarning(
+ F->getContext(), *F, L->getStartLoc(),
+ "failed explicitly specified loop vectorization");
+ else if (LH.getInterleave() != 1)
+ emitLoopInterleaveWarning(
+ F->getContext(), *F, L->getStartLoc(),
+ "failed explicitly specified loop interleaving");
+ }
+}
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
@@ -793,87 +1181,17 @@ private:
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
- LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA,
- Function *F, const TargetTransformInfo *TTI,
- LoopAccessAnalysis *LAA)
- : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
- TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
-
- /// This enum represents the kinds of inductions that we support.
- enum InductionKind {
- IK_NoInduction, ///< Not an induction variable.
- IK_IntInduction, ///< Integer induction variable. Step = C.
- IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem).
- };
-
- /// A struct for saving information about induction variables.
- struct InductionInfo {
- InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
- : StartValue(Start), IK(K), StepValue(Step) {
- assert(IK != IK_NoInduction && "Not an induction");
- assert(StartValue && "StartValue is null");
- assert(StepValue && !StepValue->isZero() && "StepValue is zero");
- assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
- "StartValue is not a pointer for pointer induction");
- assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
- "StartValue is not an integer for integer induction");
- assert(StepValue->getType()->isIntegerTy() &&
- "StepValue is not an integer");
- }
- InductionInfo()
- : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
-
- /// Get the consecutive direction. Returns:
- /// 0 - unknown or non-consecutive.
- /// 1 - consecutive and increasing.
- /// -1 - consecutive and decreasing.
- int getConsecutiveDirection() const {
- if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
- return StepValue->getSExtValue();
- return 0;
- }
-
- /// Compute the transformed value of Index at offset StartValue using step
- /// StepValue.
- /// For integer induction, returns StartValue + Index * StepValue.
- /// For pointer induction, returns StartValue[Index * StepValue].
- /// FIXME: The newly created binary instructions should contain nsw/nuw
- /// flags, which can be found from the original scalar operations.
- Value *transform(IRBuilder<> &B, Value *Index) const {
- switch (IK) {
- case IK_IntInduction:
- assert(Index->getType() == StartValue->getType() &&
- "Index type does not match StartValue type");
- if (StepValue->isMinusOne())
- return B.CreateSub(StartValue, Index);
- if (!StepValue->isOne())
- Index = B.CreateMul(Index, StepValue);
- return B.CreateAdd(StartValue, Index);
-
- case IK_PtrInduction:
- assert(Index->getType() == StepValue->getType() &&
- "Index type does not match StepValue type");
- if (StepValue->isMinusOne())
- Index = B.CreateNeg(Index);
- else if (!StepValue->isOne())
- Index = B.CreateMul(Index, StepValue);
- return B.CreateGEP(nullptr, StartValue, Index);
-
- case IK_NoInduction:
- return nullptr;
- }
- llvm_unreachable("invalid enum");
- }
-
- /// Start value.
- TrackingVH<Value> StartValue;
- /// Induction kind.
- InductionKind IK;
- /// Step value.
- ConstantInt *StepValue;
- };
+ LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ AliasAnalysis *AA, Function *F,
+ const TargetTransformInfo *TTI,
+ LoopAccessAnalysis *LAA,
+ LoopVectorizationRequirements *R,
+ const LoopVectorizeHints *H)
+ : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
+ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
+ Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
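The InductionInfo block removed above (its replacement is the InductionDescriptor used in the new InductionList typedef below) computed the transformed induction value as Start + Index * Step for integer inductions, with sub/add fast paths for steps of -1 and +1. A sketch of that integer case; emitIntInduction is illustrative:

    // Sketch: the integer-induction transform from the removed code.
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    static Value *emitIntInduction(IRBuilder<> &B, Value *Start, Value *Index,
                                   ConstantInt *Step) {
      if (Step->isMinusOne())
        return B.CreateSub(Start, Index);  // Start - Index
      if (!Step->isOne())
        Index = B.CreateMul(Index, Step);  // Index * Step
      return B.CreateAdd(Start, Index);    // Start + Index * Step
    }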
@@ -881,7 +1199,7 @@ public:
/// InductionList saves induction variables and maps them to the
/// induction descriptor.
- typedef MapVector<PHINode*, InductionInfo> InductionList;
+ typedef MapVector<PHINode*, InductionDescriptor> InductionList;
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
@@ -903,6 +1221,9 @@ public:
/// Returns True if V is an induction variable in this loop.
bool isInductionVariable(const Value *V);
+ /// Returns True if PN is a reduction variable in this loop.
+ bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+
/// Return true if the block BB needs to be predicated in order for the loop
/// to be vectorized.
bool blockNeedsPredication(BasicBlock *BB);
@@ -954,12 +1275,12 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
- return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
+ return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
- return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
+ return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
/// Returns true if vector representation of the instruction \p I
/// requires mask.
@@ -999,10 +1320,6 @@ private:
/// and we know that we can read from them without segfault.
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
- /// Returns the induction kind of Phi and record the step. This function may
- /// return NoInduction if the PHI is not an induction variable.
- InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);
-
/// \brief Collect memory access with loop invariant strides.
///
/// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
@@ -1013,16 +1330,20 @@ private:
/// not vectorized. These are handled as LoopAccessReport rather than
/// VectorizationReport because the << operator of VectorizationReport returns
/// LoopAccessReport.
- void emitAnalysis(const LoopAccessReport &Message) {
- LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
+ void emitAnalysis(const LoopAccessReport &Message) const {
+ emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
}
unsigned NumPredStores;
/// The loop that we evaluate.
Loop *TheLoop;
- /// Scev analysis.
- ScalarEvolution *SE;
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
+ /// Applies dynamic knowledge to simplify SCEV expressions in the context
+ /// of existing SCEV assumptions. The analysis will also add a minimal set
+ /// of new predicates if this is required to enable vectorization and
+ /// unrolling.
+ PredicatedScalarEvolution &PSE;
/// Target Library Info.
TargetLibraryInfo *TLI;
/// Parent function
@@ -1065,12 +1386,18 @@ private:
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
+ /// Vectorization requirements that will go through late-evaluation.
+ LoopVectorizationRequirements *Requirements;
+
+ /// Used to emit an analysis of any legality issues.
+ const LoopVectorizeHints *Hints;
+
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
/// While vectorizing these instructions we have to generate a
/// call to the appropriate masked intrinsic
- SmallPtrSet<const Instruction*, 8> MaskedOp;
+ SmallPtrSet<const Instruction *, 8> MaskedOp;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1082,15 +1409,14 @@ private:
/// different operations.
class LoopVectorizationCostModel {
public:
- LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- const TargetLibraryInfo *TLI, AssumptionCache *AC,
- const Function *F, const LoopVectorizeHints *Hints)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
- TheFunction(F), Hints(Hints) {
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
- }
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC, const Function *F,
+ const LoopVectorizeHints *Hints)
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), TheFunction(F), Hints(Hints) {}
/// Information about vectorization costs
struct VectorizationFactor {
@@ -1103,10 +1429,10 @@ public:
/// possible.
VectorizationFactor selectVectorizationFactor(bool OptForSize);
- /// \return The size (in bits) of the widest type in the code that
- /// needs to be vectorized. We ignore values that remain scalar such as
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
- unsigned getWidestType();
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
/// \return The desired interleave count.
/// If interleave count has been specified by metadata it will be returned.
@@ -1133,8 +1459,13 @@ public:
unsigned NumInstructions;
};
- /// \return information about the register usage of the loop.
- RegisterUsage calculateRegisterUsage();
+  /// \return Information about the register usage of the loop for the
+  /// given vectorization factors.
+ SmallVector<RegisterUsage, 8>
+ calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
private:
/// Returns the expected execution cost. The unit of the cost does
@@ -1155,17 +1486,20 @@ private:
/// not vectorized. These are handled as LoopAccessReport rather than
/// VectorizationReport because the << operator of VectorizationReport returns
/// LoopAccessReport.
- void emitAnalysis(const LoopAccessReport &Message) {
- LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
+ void emitAnalysis(const LoopAccessReport &Message) const {
+ emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
}
- /// Values used only by @llvm.assume calls.
- SmallPtrSet<const Value *, 32> EphValues;
+public:
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction*,uint64_t> MinBWs;
/// The loop that we evaluate.
Loop *TheLoop;
- /// Scev analysis.
- ScalarEvolution *SE;
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
/// Loop Info analysis.
LoopInfo *LI;
/// Vectorization legality.
@@ -1174,247 +1508,78 @@ private:
const TargetTransformInfo &TTI;
/// Target Library Info.
const TargetLibraryInfo *TLI;
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+ /// Assumption cache.
+ AssumptionCache *AC;
const Function *TheFunction;
- // Loop Vectorize Hint.
+ /// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
-/// Utility class for getting and setting loop vectorizer hints in the form
-/// of loop metadata.
-/// This class keeps a number of loop annotations locally (as member variables)
-/// and can, upon request, write them back as metadata on the loop. It will
-/// initially scan the loop for existing metadata, and will update the local
-/// values based on information in the loop.
-/// We cannot write all values to metadata, as the mere presence of some info,
-/// for example 'force', means a decision has been made. So, we need to be
-/// careful NOT to add them if the user hasn't specifically asked so.
-class LoopVectorizeHints {
- enum HintKind {
- HK_WIDTH,
- HK_UNROLL,
- HK_FORCE
- };
-
- /// Hint - associates name and validation with the hint value.
- struct Hint {
- const char * Name;
- unsigned Value; // This may have to change for non-numeric values.
- HintKind Kind;
-
- Hint(const char * Name, unsigned Value, HintKind Kind)
- : Name(Name), Value(Value), Kind(Kind) { }
-
- bool validate(unsigned Val) {
- switch (Kind) {
- case HK_WIDTH:
- return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
- case HK_UNROLL:
- return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
- case HK_FORCE:
- return (Val <= 1);
- }
- return false;
- }
- };
-
- /// Vectorization width.
- Hint Width;
- /// Vectorization interleave factor.
- Hint Interleave;
- /// Vectorization forced
- Hint Force;
-
- /// Return the loop metadata prefix.
- static StringRef Prefix() { return "llvm.loop."; }
-
+/// \brief This holds vectorization requirements that must be verified late in
+/// the process. The requirements are set by the legality analysis and the
+/// cost model. Once vectorization has been determined to be possible and
+/// profitable, the requirements can be verified by looking for metadata or
+/// compiler options.
+/// For example, some loops require FP commutativity, which is only allowed if
+/// vectorization is explicitly specified or if the fast-math compiler option
+/// has been provided.
+/// Late evaluation of these requirements allows helpful diagnostics to be
+/// composed that tell the user what needs to be done to vectorize the loop,
+/// for example by specifying #pragma clang loop vectorize or -ffast-math.
+/// Late evaluation should be used only when diagnostics can be generated that
+/// can be followed by a non-expert user.
+class LoopVectorizationRequirements {
public:
- enum ForceKind {
- FK_Undefined = -1, ///< Not selected.
- FK_Disabled = 0, ///< Forcing disabled.
- FK_Enabled = 1, ///< Forcing enabled.
- };
-
- LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
- : Width("vectorize.width", VectorizerParams::VectorizationFactor,
- HK_WIDTH),
- Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
- Force("vectorize.enable", FK_Undefined, HK_FORCE),
- TheLoop(L) {
- // Populate values with existing loop metadata.
- getHintsFromMetadata();
-
- // force-vector-interleave overrides DisableInterleaving.
- if (VectorizerParams::isInterleaveForced())
- Interleave.Value = VectorizerParams::VectorizationInterleave;
-
- DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
- << "LV: Interleaving disabled by the pass manager\n");
- }
-
- /// Mark the loop L as already vectorized by setting the width to 1.
- void setAlreadyVectorized() {
- Width.Value = Interleave.Value = 1;
- Hint Hints[] = {Width, Interleave};
- writeHintsToMetadata(Hints);
- }
-
- /// Dumps all the hint information.
- std::string emitRemark() const {
- VectorizationReport R;
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- R << "vectorization is explicitly disabled";
- else {
- R << "use -Rpass-analysis=loop-vectorize for more info";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=true";
- if (Width.Value != 0)
- R << ", Vector Width=" << Width.Value;
- if (Interleave.Value != 0)
- R << ", Interleave Count=" << Interleave.Value;
- R << ")";
- }
- }
-
- return R.str();
- }
-
- unsigned getWidth() const { return Width.Value; }
- unsigned getInterleave() const { return Interleave.Value; }
- enum ForceKind getForce() const { return (ForceKind)Force.Value; }
-
-private:
- /// Find hints specified in the loop metadata and update local values.
- void getHintsFromMetadata() {
- MDNode *LoopID = TheLoop->getLoopID();
- if (!LoopID)
- return;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = nullptr;
- SmallVector<Metadata *, 4> Args;
-
- // The expected hint is either a MDString or a MDNode with the first
- // operand a MDString.
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
- if (!MD || MD->getNumOperands() == 0)
- continue;
- S = dyn_cast<MDString>(MD->getOperand(0));
- for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
- Args.push_back(MD->getOperand(i));
- } else {
- S = dyn_cast<MDString>(LoopID->getOperand(i));
- assert(Args.size() == 0 && "too many arguments for MDString");
- }
-
- if (!S)
- continue;
-
- // Check if the hint starts with the loop metadata prefix.
- StringRef Name = S->getString();
- if (Args.size() == 1)
- setHint(Name, Args[0]);
+ LoopVectorizationRequirements()
+ : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}
+
+ void addUnsafeAlgebraInst(Instruction *I) {
+ // First unsafe algebra instruction.
+ if (!UnsafeAlgebraInst)
+ UnsafeAlgebraInst = I;
+ }
+
+ void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
+
+ bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *Name = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ emitOptimizationRemarkAnalysisFPCommute(
+ F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
+ VectorizationReport() << "cannot prove it is safe to reorder "
+ "floating-point operations");
+ Failed = true;
}
- }
-
- /// Checks string hint with one operand and set value if valid.
- void setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
- return;
- Name = Name.substr(Prefix().size(), StringRef::npos);
-
- const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C) return;
- unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force};
- for (auto H : Hints) {
- if (Name == H->Name) {
- if (H->validate(Val))
- H->Value = Val;
- else
- DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
- break;
- }
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ emitOptimizationRemarkAnalysisAliasing(
+ F->getContext(), Name, *F, L->getStartLoc(),
+ VectorizationReport()
+ << "cannot prove it is safe to reorder memory operations");
+ DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
}
- }
- /// Create a new hint from name / value pair.
- MDNode *createHintMetadata(StringRef Name, unsigned V) const {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {MDString::get(Context, Name),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
+ return Failed;
}
- /// Matches metadata with hint name.
- bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
- MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
- if (!Name)
- return false;
-
- for (auto H : HintTypes)
- if (Name->getString().endswith(H.Name))
- return true;
- return false;
- }
-
- /// Sets current hints into loop metadata, keeping other values intact.
- void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
- if (HintTypes.size() == 0)
- return;
-
- // Reserve the first element to LoopID (see below).
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, then ignore the existing operands.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If node in update list, ignore old value.
- if (!matchesHintMetadataName(Node, HintTypes))
- MDs.push_back(Node);
- }
- }
-
- // Now, add the missing hints.
- for (auto H : HintTypes)
- MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
-
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
-
- TheLoop->setLoopID(NewLoopID);
- }
-
- /// The loop these hints belong to.
- const Loop *TheLoop;
+private:
+ unsigned NumRuntimePointerChecks;
+ Instruction *UnsafeAlgebraInst;
};
-static void emitMissedWarning(Function *F, Loop *L,
- const LoopVectorizeHints &LH) {
- emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
- L->getStartLoc(), LH.emitRemark());
-
- if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
- if (LH.getWidth() != 1)
- emitLoopVectorizeWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop vectorization");
- else if (LH.getInterleave() != 1)
- emitLoopInterleaveWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop interleaving");
- }
-}
-
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
if (L.empty())
return V.push_back(&L);
@@ -1441,6 +1606,7 @@ struct LoopVectorize : public FunctionPass {
DominatorTree *DT;
BlockFrequencyInfo *BFI;
TargetLibraryInfo *TLI;
+ DemandedBits *DB;
AliasAnalysis *AA;
AssumptionCache *AC;
LoopAccessAnalysis *LAA;
@@ -1450,16 +1616,17 @@ struct LoopVectorize : public FunctionPass {
BlockFrequency ColdEntryFreq;
bool runOnFunction(Function &F) override {
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- BFI = &getAnalysis<BlockFrequencyInfo>();
+ BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
- AA = &getAnalysis<AliasAnalysis>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
LAA = &getAnalysis<LoopAccessAnalysis>();
+ DB = &getAnalysis<DemandedBits>();
// Compute some weights outside of the loop over the loops. Compute this
// using a BranchProbability to re-use its scaling math.
@@ -1562,26 +1729,8 @@ struct LoopVectorize : public FunctionPass {
// less verbose reporting vectorized loops and unvectorized loops that may
// benefit from vectorization, respectively.
- if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
- L->getStartLoc(), Hints.emitRemark());
- return false;
- }
-
- if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
- DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
- L->getStartLoc(), Hints.emitRemark());
- return false;
- }
-
- if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
- DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
- emitOptimizationRemarkAnalysis(
- F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- "loop not vectorized: vector width and interleave count are "
- "explicitly set to 1");
+ if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
+ DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
return false;
}
@@ -1595,15 +1744,19 @@ struct LoopVectorize : public FunctionPass {
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
- emitOptimizationRemarkAnalysis(
- F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- "vectorization is not beneficial and is not explicitly forced");
+ emitAnalysisDiag(F, L, Hints, VectorizationReport()
+ << "vectorization is not beneficial "
+ "and is not explicitly forced");
return false;
}
}
+ PredicatedScalarEvolution PSE(*SE);
+
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
+ LoopVectorizationRequirements Requirements;
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+ &Requirements, &Hints);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
@@ -1611,16 +1764,18 @@ struct LoopVectorize : public FunctionPass {
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
+ &Hints);
+ CM.collectValuesToIgnore();
// Check the function attributes to find out if this function should be
// optimized for size.
bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- F->hasFnAttribute(Attribute::OptimizeForSize);
+ F->optForSize();
// Compute the weighted frequency of this loop being executed and see if it
// is less than 20% of the function entry baseline frequency. Note that we
- // always have a canonical loop here because we think we *can* vectoriez.
+ // always have a canonical loop here because we think we *can* vectorize.
// FIXME: This is hidden behind a flag due to pervasive problems with
// exactly what block frequency models.
if (LoopVectorizeWithBlockFrequency) {
@@ -1630,16 +1785,17 @@ struct LoopVectorize : public FunctionPass {
OptForSize = true;
}
- // Check the function attributes to see if implicit floats are allowed.a
+ // Check the function attributes to see if implicit floats are allowed.
// FIXME: This check doesn't seem possibly correct -- what if the loop is
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
- emitOptimizationRemarkAnalysis(
- F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- "loop not vectorized due to NoImplicitFloat attribute");
+ emitAnalysisDiag(
+ F, L, Hints,
+ VectorizationReport()
+ << "loop not vectorized due to NoImplicitFloat attribute");
emitMissedWarning(F, L, Hints);
return false;
}
@@ -1651,32 +1807,86 @@ struct LoopVectorize : public FunctionPass {
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
- DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
- << DebugLocStr << '\n');
- DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ // Get user interleave count.
+ unsigned UserIC = Hints.getInterleave();
+
+ // Identify the diagnostic messages that should be produced.
+ std::string VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg =
+ "the cost-model indicates that vectorization is not beneficial";
+ VectorizeLoop = false;
+ }
- if (IC == 1) {
- emitOptimizationRemarkAnalysis(
- F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- "not beneficial to vectorize and user disabled interleaving");
- return false;
- }
- DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
+ if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg =
+ "the cost-model indicates that interleaving is not beneficial";
+ InterleaveLoop = false;
+ if (UserIC == 1)
+ IntDiagMsg +=
+ " and is explicitly disabled or interleave count is set to 1";
+ } else if (IC > 1 && UserIC == 1) {
+      // Tell the user interleaving is beneficial, but it is explicitly disabled.
+ DEBUG(dbgs()
+ << "LV: Interleaving is beneficial but is explicitly disabled.");
+ IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1";
+ InterleaveLoop = false;
+ }
- // Report the unrolling decision.
- emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
- Twine("interleaved by " + Twine(IC) +
- " (vectorization not beneficial)"));
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+    // Do not vectorize or interleave the loop.
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
+ L->getStartLoc(), VecDiagMsg);
+ emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
+ L->getStartLoc(), IntDiagMsg);
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
+ L->getStartLoc(), VecDiagMsg);
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
+ L->getStartLoc(), IntDiagMsg);
+ } else if (VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+    // If we decided that it is not profitable to vectorize the loop, then
+    // interleave it instead.
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
+ Unroller.vectorize(&LVL, CM.MinBWs);
- InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
- Unroller.vectorize(&LVL);
+ emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ Twine("interleaved loop (interleaved count: ") +
+ Twine(IC) + ")");
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
- LB.vectorize(&LVL);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
+ LB.vectorize(&LVL, CM.MinBWs);
++LoopsVectorized;
// Add metadata to disable runtime unrolling scalar loop when there's no
@@ -1686,7 +1896,7 @@ struct LoopVectorize : public FunctionPass {
AddRuntimeUnrollDisableMetaData(L);
// Report the vectorization decision.
- emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
Twine("vectorized loop (vectorization width: ") +
Twine(VF.Width) + ", interleaved count: " +
Twine(IC) + ")");
@@ -1703,16 +1913,19 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
- AU.addRequired<BlockFrequencyInfo>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolution>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AliasAnalysis>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessAnalysis>();
+ AU.addRequired<DemandedBits>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
};
@@ -1773,6 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
+ auto *SE = PSE.getSE();
// Make sure that the pointer does not point to structs.
if (Ptr->getType()->getPointerElementType()->isAggregateType())
return 0;
@@ -1780,11 +1994,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// If this value is a pointer induction variable we know it is consecutive.
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
if (Phi && Inductions.count(Phi)) {
- InductionInfo II = Inductions[Phi];
+ InductionDescriptor II = Inductions[Phi];
return II.getConsecutiveDirection();
}
- GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
+ GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (!Gep)
return 0;
@@ -1802,10 +2016,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// Make sure that all of the index operands are loop invariant.
for (unsigned i = 1; i < NumOperands; ++i)
- if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
- InductionInfo II = Inductions[Phi];
+ InductionDescriptor II = Inductions[Phi];
return II.getConsecutiveDirection();
}
@@ -1815,14 +2029,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// operand.
for (unsigned i = 0; i != NumOperands; ++i)
if (i != InductionOperand &&
- !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
// We can emit wide load/stores only if the last non-zero index is the
// induction variable.
const SCEV *Last = nullptr;
if (!Strides.count(Gep))
- Last = SE->getSCEV(Gep->getOperand(InductionOperand));
+ Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
else {
// Because of the multiplication by a stride we can have a s/zext cast.
// We are going to replace this stride by 1 so the cast is safe to ignore.
@@ -1833,7 +2047,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// %idxprom = zext i32 %mul to i64 << Safe cast.
// %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
//
- Last = replaceSymbolicStrideSCEV(SE, Strides,
+ Last = replaceSymbolicStrideSCEV(PSE, Strides,
Gep->getOperand(InductionOperand), Gep);
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
Last =
@@ -2177,7 +2391,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
VectorParts &Entry = WidenMap.get(Instr);
// Handle consecutive loads/stores.
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
setDebugLocFromInst(Builder, Gep);
Value *PtrOperand = Gep->getPointerOperand();
@@ -2191,8 +2405,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Ptr = Builder.Insert(Gep2);
} else if (Gep) {
setDebugLocFromInst(Builder, Gep);
- assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
- OrigLoop) && "Base ptr must be invariant");
+ assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
+ OrigLoop) &&
+ "Base ptr must be invariant");
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
@@ -2209,7 +2424,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (i == InductionOperand ||
(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
assert((i == InductionOperand ||
- SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
+ OrigLoop)) &&
"Must be last index or loop invariant");
VectorParts &GEPParts = getVectorValue(GepOperand);
@@ -2237,14 +2453,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// We don't want to update the value in the map as it might be used in
// another expression. So don't use a reference type for "StoredVal".
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
-
+
for (unsigned Part = 0; Part < UF; ++Part) {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
- // If we store to reverse consecutive memory locations then we need
+ // If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal[Part] = reverseVector(StoredVal[Part]);
// If the address is consecutive but reversed, then the
@@ -2298,7 +2514,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
@@ -2318,7 +2535,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
// Try using previously calculated values.
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
- // If the src is an instruction that appeared earlier in the basic block
+ // If the src is an instruction that appeared earlier in the basic block,
// then it should already be vectorized.
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
@@ -2343,19 +2560,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
- Instruction *InsertPt = Builder.GetInsertPoint();
- BasicBlock *IfBlock = Builder.GetInsertBlock();
- BasicBlock *CondBlock = nullptr;
-
VectorParts Cond;
- Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
- VectorLp = LI->getLoopFor(IfBlock);
- assert(VectorLp && "Must have a loop for this block");
}
// For each vector unroll 'part':
@@ -2367,12 +2577,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
Value *Cmp = nullptr;
if (IfPredicateStore) {
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
- Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- LoopVectorBody.push_back(CondBlock);
- VectorLp->addBasicBlockToLoop(CondBlock, *LI);
- // Update Builder with newly created basic block.
- Builder.SetInsertPoint(InsertPt);
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
+ ConstantInt::get(Cmp->getType(), 1));
}
Instruction *Cloned = Instr->clone();
@@ -2396,85 +2602,223 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
// End if-block.
- if (IfPredicateStore) {
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- LoopVectorBody.push_back(NewIfBlock);
- VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
- Builder.SetInsertPoint(InsertPt);
- ReplaceInstWithInst(IfBlock->getTerminator(),
- BranchInst::Create(CondBlock, NewIfBlock, Cmp));
- IfBlock = NewIfBlock;
- }
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+ Cmp));
}
}
}
-static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
- Instruction *Loc) {
- if (FirstInst)
- return FirstInst;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return I->getParent() == Loc->getParent() ? I : nullptr;
- return nullptr;
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+ Value *End, Value *Step,
+ Instruction *DL) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+  // As we're just creating this loop, it's possible that no latch exists
+  // yet. If so, use the header, as this will be a single-block loop.
+ if (!Latch)
+ Latch = Header;
+
+ IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
+ auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+
+ Builder.SetInsertPoint(Latch->getTerminator());
+
+ // Create i+1 and fill the PHINode.
+ Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Start, L->getLoopPreheader());
+ Induction->addIncoming(Next, Latch);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(Next, End);
+ Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+
+ // Now we have two terminators. Remove the old one from the block.
+ Latch->getTerminator()->eraseFromParent();
+
+ return Induction;
}
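+// A minimal sketch (illustrative IR, not emitted verbatim) of what the
+// helper above produces for Start = 0, Step = VF * UF = 8, End = %n.vec:
+//
+//   vector.body:
+//     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+//     ...
+//     %index.next = add i64 %index, 8
+//     %cmp = icmp eq i64 %index.next, %n.vec
+//     br i1 %cmp, label %middle.block, label %vector.body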
-std::pair<Instruction *, Instruction *>
-InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
- Instruction *tnullptr = nullptr;
- if (!Legal->mustCheckStrides())
- return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
-
- IRBuilder<> ChkBuilder(Loc);
-
- // Emit checks.
- Value *Check = nullptr;
- Instruction *FirstInst = nullptr;
- for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
- SE = Legal->strides_end();
- SI != SE; ++SI) {
- Value *Ptr = stripIntegerCast(*SI);
- Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
- "stride.chk");
- // Store the first instruction we create.
- FirstInst = getFirstInst(FirstInst, C, Loc);
- if (Check)
- Check = ChkBuilder.CreateOr(Check, C);
- else
- Check = C;
- }
+Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+ if (TripCount)
+ return TripCount;
- // We have to do this trickery because the IRBuilder might fold the check to a
- // constant expression in which case there is no Instruction anchored in a
- // the block.
- LLVMContext &Ctx = Loc->getContext();
- Instruction *TheCheck =
- BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
- ChkBuilder.Insert(TheCheck, "stride.not.one");
- FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ // Find the loop boundaries.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
+ assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+ "Invalid loop count");
- return std::make_pair(FirstInst, TheCheck);
+ Type *IdxTy = Legal->getWidestInductionType();
+
+  // The exit count might have a type of i64 while the phi is i32. This can
+  // happen if we have an induction variable that is sign-extended before the
+  // compare. The only way we can get a backedge-taken count in that case is
+  // when the induction variable is signed and therefore cannot overflow, so
+  // the truncation is legal.
+ if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, DL, "induction");
+
+ // Count holds the overall loop count (N).
+ TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ L->getLoopPreheader()->getTerminator());
+
+ if (TripCount->getType()->isPointerTy())
+ TripCount =
+ CastInst::CreatePointerCast(TripCount, IdxTy,
+ "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
+
+ return TripCount;
}
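+// A worked micro-example (assumed types, illustrative): for a loop such as
+// `for (i32 i = 0; i < n; ++i)` whose compare happens after a sign-extension
+// to i64, SCEV reports an i64 backedge-taken count. Since that count is only
+// computable when the i32 induction variable cannot wrap, the code above
+// truncates it back to the widest induction type and expands trunc(btc) + 1
+// in the preheader as the trip count.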
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getOrCreateTripCount(L);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+
+ // Now we need to generate the expression for N - (N % VF), which is
+ // the part that the vectorized body will execute.
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
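+// A worked example (assumed values): with TC = 13, VF = 4 and UF = 2,
+// Step = 8, so n.mod.vf = 13 urem 8 = 5 and n.vec = 13 - 5 = 8. The vector
+// body then runs a single iteration and the scalar remainder loop executes
+// the final 5 iterations.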
+
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+  // Generate code to check that the loop's trip count, computed by adding one
+  // to the backedge-taken count, did not overflow and that the loop runs for
+  // at least VF * UF iterations.
+ Value *CheckMinIters =
+ Builder.CreateICmpULT(Count,
+ ConstantInt::get(Count->getType(), VF * UF),
+ "min.iters.check");
+
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
+ "min.iters.checked");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, CheckMinIters));
+ LoopBypassBlocks.push_back(BB);
+}
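+// A worked example (assumed i8 counter, purely illustrative): if the
+// backedge-taken count is 255 (the i8 maximum), adding one wraps the trip
+// count to 0; `icmp ult i8 0, VF * UF` is then true and the branch above
+// sends control to the scalar loop instead of running the vector body with
+// a bogus count.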
+
+void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *TC = getOrCreateVectorTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+  // Now, compare the new count to zero. If it is zero, skip the vector loop
+  // and jump to the scalar loop.
+ Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
+ "cmp.zero");
+
+  // Split off the new vector pre-header; it is entered only when the vector
+  // loop will actually execute at least one iteration.
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
+ "vector.ph");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, Cmp));
+ LoopBypassBlocks.push_back(BB);
+}
+
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+  // Generate the code to check the SCEV assumptions that we made.
+ // We want the new basic block to start at the first instruction in a
+ // sequence of instructions that form a check.
+ SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
+ "scev.check");
+ Value *SCEVCheck =
+ Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (C->isZero())
+ return;
+
+  // Create a new block containing the SCEV check.
+ BB->setName("vector.scevcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, SCEVCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+}
+
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
+ BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+  // Generate the code that checks at runtime whether the arrays overlap. We
+  // put the checks into a separate block so that the common case of no
+  // overlap stays fast.
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+ if (!MemRuntimeCheck)
+ return;
+
+ // Create a new block containing the memory check.
+ BB->setName("vector.memcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+}
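+// A sketch of the resulting CFG (block names as set above, illustrative):
+//
+//   vector.memcheck:            ; former preheader, now holds the checks
+//     %memcheck = ...           ; overlap test built by addRuntimeChecks()
+//     br i1 %memcheck, label %scalar.ph, label %vector.ph
+//
+// The no-overlap case falls through to vector.ph; only genuinely aliasing
+// arrays pay for the scalar fallback.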
+
+
void InnerLoopVectorizer::createEmptyLoop() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- Back-edge taken count overflow check.
+ [ ] <-- loop iteration number check.
/ |
/ v
| [ ] <-- vector loop bypass (may consist of multiple blocks).
| / |
| / v
|| [ ] <-- vector pre header.
- || |
- || v
- || [ ] \
- || [ ]_| <-- vector loop.
- || |
- | \ v
- | >[ ] <--- middle-block.
+ |/ |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ | v
+ | -[ ] <--- middle-block.
| / |
| / v
-|- >[ ] <--- new preheader.
@@ -2498,65 +2842,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
// don't. One example is c++ iterators that often have multiple pointer
// induction variables. In the code below we also support a case where we
// don't have a single induction variable.
+ //
+  // We try as hard as possible to obtain an induction variable from the
+  // original loop. However, if we don't find one that:
+ // - is an integer
+ // - counts from zero, stepping by one
+ // - is the size of the widest induction variable type
+ // then we create a new one.
OldInduction = Legal->getInduction();
Type *IdxTy = Legal->getWidestInductionType();
- // Find the loop boundaries.
- const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
- assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (ExitCount->getType()->getPrimitiveSizeInBits() >
- IdxTy->getPrimitiveSizeInBits())
- ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
-
- const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
- // Get the total trip count from the count by adding 1.
- ExitCount = SE->getAddExpr(BackedgeTakeCount,
- SE->getConstant(BackedgeTakeCount->getType(), 1));
-
- const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
-
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, DL, "induction");
-
- // We need to test whether the backedge-taken count is uint##_max. Adding one
- // to it will cause overflow and an incorrect loop trip count in the vector
- // body. In case of overflow we want to directly jump to the scalar remainder
- // loop.
- Value *BackedgeCount =
- Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
- VectorPH->getTerminator());
- if (BackedgeCount->getType()->isPointerTy())
- BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
- "backedge.ptrcnt.to.int",
- VectorPH->getTerminator());
- Instruction *CheckBCOverflow =
- CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
- Constant::getAllOnesValue(BackedgeCount->getType()),
- "backedge.overflow", VectorPH->getTerminator());
-
- // The loop index does not have to start at Zero. Find the original start
- // value from the induction PHI node. If we don't have an induction variable
- // then we know that it starts at zero.
- Builder.SetInsertPoint(VectorPH->getTerminator());
- Value *StartIdx = ExtendedIdx =
- OldInduction
- ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH),
- IdxTy)
- : ConstantInt::get(IdxTy, 0);
-
- // Count holds the overall loop count (N).
- Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- VectorPH->getTerminator());
-
- LoopBypassBlocks.push_back(VectorPH);
-
// Split the single block loop into the two loop structure described above.
BasicBlock *VecBody =
VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
@@ -2580,118 +2875,36 @@ void InnerLoopVectorizer::createEmptyLoop() {
}
Lp->addBasicBlockToLoop(VecBody, *LI);
- // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
- // inside the loop.
- Builder.SetInsertPoint(VecBody->getFirstNonPHI());
-
- // Generate the induction variable.
- setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
- Induction = Builder.CreatePHI(IdxTy, 2, "index");
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
- Constant *Step = ConstantInt::get(IdxTy, VF * UF);
-
- // Generate code to check that the loop's trip count that we computed by
- // adding one to the backedge-taken count will not overflow.
- BasicBlock *NewVectorPH =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked");
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
- ReplaceInstWithInst(
- VectorPH->getTerminator(),
- BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow));
- VectorPH = NewVectorPH;
-
- // This is the IR builder that we use to add all of the logic for bypassing
- // the new vector loop.
- IRBuilder<> BypassBuilder(VectorPH->getTerminator());
- setDebugLocFromInst(BypassBuilder,
- getDebugLocFromInstOrOperands(OldInduction));
-
- // We may need to extend the index in case there is a type mismatch.
- // We know that the count starts at zero and does not overflow.
- if (Count->getType() != IdxTy) {
- // The exit count can be of pointer type. Convert it to the correct
- // integer type.
- if (ExitCount->getType()->isPointerTy())
- Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
- else
- Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
- }
-
- // Add the start index to the loop count to get the new end index.
- Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
+ // Find the loop boundaries.
+ Value *Count = getOrCreateTripCount(Lp);
- // Now we need to generate the expression for N - (N % VF), which is
- // the part that the vectorized body will execute.
- Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
- Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
- Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
- "end.idx.rnd.down");
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ // We need to test whether the backedge-taken count is uint##_max. Adding one
+ // to it will cause overflow and an incorrect loop trip count in the vector
+ // body. In case of overflow we want to directly jump to the scalar remainder
+ // loop.
+ emitMinimumIterationCountCheck(Lp, ScalarPH);
  // Now, compare the new count to zero. If it is zero, skip the vector loop
  // and jump to the scalar loop.
- Value *Cmp =
- BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
- NewVectorPH =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
- LoopBypassBlocks.push_back(VectorPH);
- ReplaceInstWithInst(VectorPH->getTerminator(),
- BranchInst::Create(MiddleBlock, NewVectorPH, Cmp));
- VectorPH = NewVectorPH;
-
- // Generate the code to check that the strides we assumed to be one are really
- // one. We want the new basic block to start at the first instruction in a
- // sequence of instructions that form a check.
- Instruction *StrideCheck;
- Instruction *FirstCheckInst;
- std::tie(FirstCheckInst, StrideCheck) =
- addStrideCheck(VectorPH->getTerminator());
- if (StrideCheck) {
- AddedSafetyChecks = true;
- // Create a new block containing the stride check.
- VectorPH->setName("vector.stridecheck");
- NewVectorPH =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
- LoopBypassBlocks.push_back(VectorPH);
-
- // Replace the branch into the memory check block with a conditional branch
- // for the "few elements case".
- ReplaceInstWithInst(
- VectorPH->getTerminator(),
- BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck));
-
- VectorPH = NewVectorPH;
- }
+ emitVectorLoopEnteredCheck(Lp, ScalarPH);
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ emitSCEVChecks(Lp, ScalarPH);
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block so that the common case of no
  // overlap stays fast.
- Instruction *MemRuntimeCheck;
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator());
- if (MemRuntimeCheck) {
- AddedSafetyChecks = true;
- // Create a new block containing the memory check.
- VectorPH->setName("vector.memcheck");
- NewVectorPH =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
- LoopBypassBlocks.push_back(VectorPH);
-
- // Replace the branch into the memory check block with a conditional branch
- // for the "few elements case".
- ReplaceInstWithInst(
- VectorPH->getTerminator(),
- BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck));
-
- VectorPH = NewVectorPH;
- }
+ emitMemRuntimeChecks(Lp, ScalarPH);
+
+ // Generate the induction variable.
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
@@ -2701,152 +2914,60 @@ void InnerLoopVectorizer::createEmptyLoop() {
// If we come from a bypass edge then we need to start from the original
// start value.
- // This variable saves the new starting index for the scalar loop.
- PHINode *ResumeIndex = nullptr;
+ // This variable saves the new starting index for the scalar loop. It is used
+ // to test if there are any tail iterations left once the vector loop has
+ // completed.
LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
- // Set builder to point to last bypass block.
- BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
for (I = List->begin(), E = List->end(); I != E; ++I) {
PHINode *OrigPhi = I->first;
- LoopVectorizationLegality::InductionInfo II = I->second;
-
- Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
- PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
- MiddleBlock->getTerminator());
- // We might have extended the type of the induction variable but we need a
- // truncated version for the scalar loop.
- PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
- PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
- MiddleBlock->getTerminator()) : nullptr;
+ InductionDescriptor II = I->second;
// Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+ PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
+ "bc.resume.val",
ScalarPH->getTerminator());
- BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
-
- PHINode *BCTruncResumeVal = nullptr;
+ Value *EndValue;
if (OrigPhi == OldInduction) {
- BCTruncResumeVal =
- PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
- ScalarPH->getTerminator());
- BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
- }
-
- Value *EndValue = nullptr;
- switch (II.IK) {
- case LoopVectorizationLegality::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case LoopVectorizationLegality::IK_IntInduction: {
- // Handle the integer induction counter.
- assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
-
- // We have the canonical induction variable.
- if (OrigPhi == OldInduction) {
- // Create a truncated version of the resume value for the scalar loop,
- // we might have promoted the type to a larger width.
- EndValue =
- BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
- TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
- TruncResumeVal->addIncoming(EndValue, VecBody);
-
- BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
-
- // We know what the end value is.
- EndValue = IdxEndRoundDown;
- // We also know which PHI node holds it.
- ResumeIndex = ResumeVal;
- break;
- }
-
- // Not the canonical induction variable - add the vector loop count to the
- // start value.
- Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
- II.StartValue->getType(),
- "cast.crd");
- EndValue = II.transform(BypassBuilder, CRD);
+ // We know what the end value is.
+ EndValue = CountRoundDown;
+ } else {
+ IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
+ Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
+ II.getStepValue()->getType(),
+ "cast.crd");
+ EndValue = II.transform(B, CRD);
EndValue->setName("ind.end");
- break;
}
- case LoopVectorizationLegality::IK_PtrInduction: {
- Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
- II.StepValue->getType(),
- "cast.crd");
- EndValue = II.transform(BypassBuilder, CRD);
- EndValue->setName("ptr.ind.end");
- break;
- }
- }// end of case
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
- if (OrigPhi == OldInduction)
- ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
- else
- ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
- }
- ResumeVal->addIncoming(EndValue, VecBody);
+ BCResumeVal->addIncoming(EndValue, MiddleBlock);
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
// The old induction's phi node in the scalar body needs the truncated
// value.
- if (OrigPhi == OldInduction) {
- BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
- OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
- } else {
- BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
- OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
- }
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
+ OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
}
- // If we are generating a new induction variable then we also need to
- // generate the code that calculates the exit value. This value is not
- // simply the end of the counter because we may skip the vectorized body
- // in case of a runtime check.
- if (!OldInduction){
- assert(!ResumeIndex && "Unexpected resume value found");
- ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
- MiddleBlock->getTerminator());
- for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
- ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
- ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
- }
-
- // Make sure that we found the index where scalar loop needs to continue.
- assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
- "Invalid resume Index");
-
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
- Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
- ResumeIndex, "cmp.n",
+ Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n",
MiddleBlock->getTerminator());
ReplaceInstWithInst(MiddleBlock->getTerminator(),
BranchInst::Create(ExitBlock, ScalarPH, CmpN));
- // Create i+1 and fill the PHINode.
- Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
- Induction->addIncoming(StartIdx, VectorPH);
- Induction->addIncoming(NextIdx, VecBody);
- // Create the compare.
- Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
- Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
-
- // Now we have two terminators. Remove the old one from the block.
- VecBody->getTerminator()->eraseFromParent();
-
// Get ready to start creating new instructions into the vectorized body.
- Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+ Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
// Save the state.
- LoopVectorPreHeader = VectorPH;
+ LoopVectorPreHeader = Lp->getLoopPreheader();
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
@@ -2899,7 +3020,7 @@ static void cse(SmallVector<BasicBlock *, 4> &BBs) {
for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
BasicBlock *BB = BBs[i];
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = I++;
+ Instruction *In = &*I++;
if (!CSEDenseMapInfo::canHandle(In))
continue;
@@ -3021,6 +3142,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
+static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
+ IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
+ IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
+}
+static Type *largestIntegerVectorType(Type *T1, Type *T2) {
+ IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
+ IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
+}
+
+void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+ // For every instruction `I` in MinBWs, truncate the operands, create a
+ // truncated version of `I` and reextend its result. InstCombine runs
+ // later and will remove any ext/trunc pairs.
+ //
+ for (auto &KV : MinBWs) {
+ VectorParts &Parts = WidenMap.get(KV.first);
+ for (Value *&I : Parts) {
+ if (I->use_empty())
+ continue;
+ Type *OriginalTy = I->getType();
+ Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
+ KV.second);
+ Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
+ OriginalTy->getVectorNumElements());
+ if (TruncatedTy == OriginalTy)
+ continue;
+
+ IRBuilder<> B(cast<Instruction>(I));
+ auto ShrinkOperand = [&](Value *V) -> Value* {
+ if (auto *ZI = dyn_cast<ZExtInst>(V))
+ if (ZI->getSrcTy() == TruncatedTy)
+ return ZI->getOperand(0);
+ return B.CreateZExtOrTrunc(V, TruncatedTy);
+ };
+
+ // The actual instruction modification depends on the instruction type,
+ // unfortunately.
+ Value *NewI = nullptr;
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(),
+ ShrinkOperand(BO->getOperand(0)),
+ ShrinkOperand(BO->getOperand(1)));
+ cast<BinaryOperator>(NewI)->copyIRFlags(I);
+ } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
+ NewI = B.CreateICmp(CI->getPredicate(),
+ ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ NewI = B.CreateSelect(SI->getCondition(),
+ ShrinkOperand(SI->getTrueValue()),
+ ShrinkOperand(SI->getFalseValue()));
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ switch (CI->getOpcode()) {
+ default: llvm_unreachable("Unhandled cast!");
+ case Instruction::Trunc:
+ NewI = ShrinkOperand(CI->getOperand(0));
+ break;
+ case Instruction::SExt:
+ NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy,
+ TruncatedTy));
+ break;
+ case Instruction::ZExt:
+ NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy,
+ TruncatedTy));
+ break;
+ }
+ } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 =
+ B.CreateZExtOrTrunc(SI->getOperand(0),
+ VectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
+ auto *O1 =
+ B.CreateZExtOrTrunc(SI->getOperand(1),
+ VectorType::get(ScalarTruncatedTy, Elements1));
+
+ NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
+ } else if (isa<LoadInst>(I)) {
+ // Don't do anything with the operands, just extend the result.
+ continue;
+ } else {
+ llvm_unreachable("Unhandled instruction type!");
+ }
+
+ // Lastly, extend the result.
+ NewI->takeName(cast<Instruction>(I));
+ Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
+ I->replaceAllUsesWith(Res);
+ cast<Instruction>(I)->eraseFromParent();
+ I = Res;
+ }
+ }
+
+  // We'll have created a bunch of ZExts that are now dead. Clean them up.
+ for (auto &KV : MinBWs) {
+ VectorParts &Parts = WidenMap.get(KV.first);
+ for (Value *&I : Parts) {
+ ZExtInst *Inst = dyn_cast<ZExtInst>(I);
+ if (Inst && Inst->use_empty()) {
+ Value *NewI = Inst->getOperand(0);
+ Inst->eraseFromParent();
+ I = NewI;
+ }
+ }
+ }
+}
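+// A before/after sketch (assuming MinBWs proved 8 bits suffice for `I`):
+//
+//   %a = add <4 x i32> %x, %y
+//
+// becomes
+//
+//   %x.tr = trunc <4 x i32> %x to <4 x i8>
+//   %y.tr = trunc <4 x i32> %y to <4 x i8>
+//   %a.tr = add <4 x i8> %x.tr, %y.tr
+//   %a    = zext <4 x i8> %a.tr to <4 x i32>
+//
+// InstCombine later folds away any redundant ext/trunc pairs this creates.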
+
void InnerLoopVectorizer::vectorizeLoop() {
//===------------------------------------------------===//
//
@@ -3051,6 +3283,11 @@ void InnerLoopVectorizer::vectorizeLoop() {
be = DFS.endRPO(); bb != be; ++bb)
vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
+ // Insert truncates and extends for any truncated instructions as hints to
+ // InstCombine.
+ if (VF > 1)
+ truncateToMinimalBitwidths();
+
// At this point every instruction in the original loop is widened to
// a vector form. We are almost done. Now, we need to fix the PHI nodes
// that we vectorized. The PHI nodes are currently empty because we did
@@ -3066,7 +3303,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
assert(RdxPhi && "Unable to recover vectorized PHI");
// Find the reduction variable descriptor.
- assert(Legal->getReductionVars()->count(RdxPhi) &&
+ assert(Legal->isReductionVariable(RdxPhi) &&
"Unable to find the reduction variable");
RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
@@ -3141,21 +3378,33 @@ void InnerLoopVectorizer::vectorizeLoop() {
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
// instructions.
- Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- VectorParts RdxParts;
+ VectorParts RdxParts = getVectorValue(LoopExitInst);
setDebugLocFromInst(Builder, LoopExitInst);
- for (unsigned part = 0; part < UF; ++part) {
- // This PHINode contains the vectorized reduction variable, or
- // the initial value vector, if we bypass the vector loop.
- VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
- PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
- Value *StartVal = (part == 0) ? VectorStart : Identity;
- for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
- NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
- NewPhi->addIncoming(RdxExitVal[part],
- LoopVectorBody.back());
- RdxParts.push_back(NewPhi);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
+ for (unsigned part = 0; part < UF; ++part) {
+ Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[part]->user_begin();
+ UI != RdxParts[part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+ RdxParts[part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned part = 0; part < UF; ++part)
+ RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
}
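+      // A sketch of the narrowing above (assumed i8 reduction widened to
+      // i32, illustrative only): inside the vector body each unrolled part
+      // %rdx becomes
+      //
+      //   %t = trunc <4 x i32> %rdx to <4 x i8>
+      //   %e = zext <4 x i8> %t to <4 x i32>   ; sext when isSigned()
+      //
+      // with users of %rdx rewired to %e, and in the middle block the parts
+      // are truncated back to <4 x i8> before the final reduction.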
// Reduce all of the unrolled parts into a single vector.
@@ -3208,13 +3457,22 @@ void InnerLoopVectorizer::vectorizeLoop() {
// The result is in the first element of the vector.
ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
Builder.getInt32(0));
+
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (RdxPhi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType());
}
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
// Now, we need to fix the users of the reduction variable
@@ -3252,6 +3510,20 @@ void InnerLoopVectorizer::vectorizeLoop() {
fixLCSSAPHIs();
+ // Make sure DomTree is updated.
+ updateAnalysis();
+
+ // Predicate any stores.
+ for (auto KV : PredicatedStores) {
+ BasicBlock::iterator I(KV.first);
+ auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
+ auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DT);
+ I->moveBefore(T);
+ I->getParent()->setName("pred.store.if");
+ BB->setName("pred.store.continue");
+ }
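+  // A sketch of the control flow this creates per store (block names as set
+  // above, illustrative):
+  //
+  //   pred.store.if:                  ; entered only if the mask bit is set
+  //     store i32 %val, i32* %ptr
+  //     br label %pred.store.continue
+  //
+  // so masked-out lanes never touch memory.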
+ DEBUG(DT->verifyDomTree());
// Remove redundant induction instructions.
cse(LoopVectorBody);
}
@@ -3326,18 +3598,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
return BlockMask;
}
-void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
- InnerLoopVectorizer::VectorParts &Entry,
- unsigned UF, unsigned VF, PhiVector *PV) {
+void InnerLoopVectorizer::widenPHIInstruction(
+ Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
+ unsigned VF, PhiVector *PV) {
PHINode* P = cast<PHINode>(PN);
// Handle reduction variables:
- if (Legal->getReductionVars()->count(P)) {
+ if (Legal->isReductionVariable(P)) {
for (unsigned part = 0; part < UF; ++part) {
// This is phase one of vectorizing PHIs.
Type *VecTy = (VF == 1) ? PN->getType() :
VectorType::get(PN->getType(), VF);
- Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
- LoopVectorBody.back()-> getFirstInsertionPt());
+ Entry[part] = PHINode::Create(
+ VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
}
PV->push_back(P);
return;
@@ -3385,53 +3657,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
assert(Legal->getInductionVars()->count(P) &&
"Not an induction variable");
- LoopVectorizationLegality::InductionInfo II =
- Legal->getInductionVars()->lookup(P);
+ InductionDescriptor II = Legal->getInductionVars()->lookup(P);
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
- switch (II.IK) {
- case LoopVectorizationLegality::IK_NoInduction:
+ switch (II.getKind()) {
+ case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");
- case LoopVectorizationLegality::IK_IntInduction: {
- assert(P->getType() == II.StartValue->getType() && "Types must match");
- Type *PhiTy = P->getType();
- Value *Broadcasted;
- if (P == OldInduction) {
- // Handle the canonical induction variable. We might have had to
- // extend the type.
- Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
- } else {
- // Handle other induction variables that are now based on the
- // canonical one.
- Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
- "normalized.idx");
- NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
- Broadcasted = II.transform(Builder, NormalizedIdx);
- Broadcasted->setName("offset.idx");
+ case InductionDescriptor::IK_IntInduction: {
+ assert(P->getType() == II.getStartValue()->getType() &&
+ "Types must match");
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ Value *V = Induction;
+ if (P != OldInduction) {
+ V = Builder.CreateSExtOrTrunc(Induction, P->getType());
+ V = II.transform(Builder, V);
+ V->setName("offset.idx");
}
- Broadcasted = getBroadcastInstrs(Broadcasted);
+ Value *Broadcasted = getBroadcastInstrs(V);
// After broadcasting the induction variable we need to make the vector
// consecutive by adding 0, 1, 2, etc.
for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
+ Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue());
return;
}
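
As a standalone sketch of the lane values built here, assuming InductionDescriptor::transform computes Start + Index * Step for integer IVs (the offset.idx computation above), getStepVector then adds (VF * part + lane) * Step per lane:

#include <cstdint>
#include <vector>

// Lane values for an integer IV with start S and step C, where Ind is the
// canonical IV's value at the top of the current vector iteration.
std::vector<int64_t> iv_lanes(int64_t S, int64_t C, int64_t Ind, unsigned VF,
                              unsigned Part) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Lanes[Lane] = S + (Ind + int64_t(VF) * Part + Lane) * C;
  return Lanes;
}
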
- case LoopVectorizationLegality::IK_PtrInduction:
+ case InductionDescriptor::IK_PtrInduction:
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
// This is the normalized GEP that starts counting at zero.
- Value *NormalizedIdx =
- Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
- NormalizedIdx =
- Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType());
+ Value *PtrInd = Induction;
+ PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());
// This is the vector of results. Notice that we don't generate
// vector geps because scalar geps result in better code.
for (unsigned part = 0; part < UF; ++part) {
if (VF == 1) {
int EltIndex = part;
- Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
- Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx);
SclrGep->setName("next.gep");
Entry[part] = SclrGep;
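
The scalar "next.gep" values follow the same indexing scheme. A short standalone sketch over int elements (illustrative only; the real code emits GEPs via II.transform):

#include <cstdint>

// With VF == 1 and Lane == 0 this reduces to PtrInd + Part, matching the
// scalar case above.
int *next_gep(int *Start, int64_t Step, int64_t PtrInd, unsigned VF,
              unsigned Part, unsigned Lane) {
  return Start + (PtrInd + int64_t(Part) * VF + Lane) * Step;
}
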
@@ -3441,8 +3704,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
int EltIndex = i + part * VF;
- Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
- Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx);
SclrGep->setName("next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
@@ -3458,7 +3721,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- VectorParts &Entry = WidenMap.get(it);
+ VectorParts &Entry = WidenMap.get(&*it);
+
switch (it->getOpcode()) {
case Instruction::Br:
// Nothing to do for PHIs and BR, since we already took care of the
@@ -3466,7 +3730,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
continue;
case Instruction::PHI: {
// Vectorize PHINodes.
- widenPHIInstruction(it, Entry, UF, VF, PV);
+ widenPHIInstruction(&*it, Entry, UF, VF, PV);
continue;
}// End of PHI.
@@ -3504,16 +3768,17 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Entry[Part] = V;
}
- propagateMetadata(Entry, it);
+ propagateMetadata(Entry, &*it);
break;
}
case Instruction::Select: {
// Widen selects.
// If the selector is loop invariant we can create a select
// instruction with a scalar condition. Otherwise, use vector-select.
- bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
- OrigLoop);
- setDebugLocFromInst(Builder, it);
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &*it);
// The condition can be loop invariant but still defined inside the
// loop. This means that we can't just use the original 'cond' value.
@@ -3522,7 +3787,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));
-
+
Value *ScalarCond = (VF == 1) ? Cond[0] :
Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
@@ -3533,7 +3798,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Op1[Part]);
}
- propagateMetadata(Entry, it);
+ propagateMetadata(Entry, &*it);
break;
}
@@ -3542,25 +3807,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// Widen compares. Generate vector compares.
bool FCmp = (it->getOpcode() == Instruction::FCmp);
CmpInst *Cmp = dyn_cast<CmpInst>(it);
- setDebugLocFromInst(Builder, it);
+ setDebugLocFromInst(Builder, &*it);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *C = nullptr;
- if (FCmp)
+ if (FCmp) {
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
- else
+ cast<FCmpInst>(C)->copyFastMathFlags(&*it);
+ } else {
C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+ }
Entry[Part] = C;
}
- propagateMetadata(Entry, it);
+ propagateMetadata(Entry, &*it);
break;
}
case Instruction::Store:
case Instruction::Load:
- vectorizeMemoryInstruction(it);
+ vectorizeMemoryInstruction(&*it);
break;
case Instruction::ZExt:
case Instruction::SExt:
@@ -3575,7 +3842,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
CastInst *CI = dyn_cast<CastInst>(it);
- setDebugLocFromInst(Builder, it);
+ setDebugLocFromInst(Builder, &*it);
/// Optimize the special case where the source is the induction
/// variable. Notice that we can only optimize the 'trunc' case
/// because: a. FP conversions lose precision, b. sext/zext may wrap,
@@ -3585,13 +3852,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
CI->getType());
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
- LoopVectorizationLegality::InductionInfo II =
+ InductionDescriptor II =
Legal->getInductionVars()->lookup(OldInduction);
- Constant *Step =
- ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
+ Constant *Step = ConstantInt::getSigned(
+ CI->getType(), II.getStepValue()->getSExtValue());
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
- propagateMetadata(Entry, it);
+ propagateMetadata(Entry, &*it);
break;
}
/// Vectorize casts.
@@ -3601,7 +3868,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
VectorParts &A = getVectorValue(it->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
- propagateMetadata(Entry, it);
+ propagateMetadata(Entry, &*it);
break;
}
@@ -3609,7 +3876,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// Ignore dbg intrinsics.
if (isa<DbgInfoIntrinsic>(it))
break;
- setDebugLocFromInst(Builder, it);
+ setDebugLocFromInst(Builder, &*it);
Module *M = BB->getParent()->getParent();
CallInst *CI = cast<CallInst>(it);
@@ -3625,7 +3892,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
if (ID &&
(ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(it);
+ scalarizeInstruction(&*it);
break;
}
// The flag shows whether we use Intrinsic or a usual Call for vectorized
@@ -3636,7 +3903,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(it);
+ scalarizeInstruction(&*it);
break;
}
@@ -3677,13 +3944,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Entry[Part] = Builder.CreateCall(VectorF, Args);
}
- propagateMetadata(Entry, it);
+ propagateMetadata(Entry, &*it);
break;
}
default:
// All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(it);
+ scalarizeInstruction(&*it);
break;
}// end of switch.
}// end of for_each instr.
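
For the default case, scalarizeInstruction replicates the operation instead of widening it. A standalone C++ stand-in for that replication pattern (names are illustrative):

// Unsupported operations are executed lane by lane on extracted scalars,
// one clone per unroll part and per lane.
void scalarize(int (*op)(int), const int *in, int *out, unsigned UF,
               unsigned VF) {
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      unsigned i = Part * VF + Lane;
      out[i] = op(in[i]);
    }
}
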
@@ -3691,7 +3958,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
void InnerLoopVectorizer::updateAnalysis() {
// Forget the original basic block.
- SE->forgetLoop(OrigLoop);
+ PSE.getSE()->forgetLoop(OrigLoop);
// Update the dominator tree information.
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
@@ -3701,19 +3968,12 @@ void InnerLoopVectorizer::updateAnalysis() {
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
- // Due to if predication of stores we might create a sequence of "if(pred)
- // a[i] = ...; " blocks.
- for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
- if (i == 0)
- DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
- else if (isPredicatedBlock(i)) {
- DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
- } else {
- DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
- }
- }
+ // We don't predicate stores by this point, so the vector body should be a
+ // single loop.
+ assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
+ DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
- DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+ DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
@@ -3850,10 +4110,10 @@ bool LoopVectorizationLegality::canVectorize() {
}
// ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
- if (ExitCount == SE->getCouldNotCompute()) {
- emitAnalysis(VectorizationReport() <<
- "could not determine number of loop iterations");
+ const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop);
+ if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
+ emitAnalysis(VectorizationReport()
+ << "could not determine number of loop iterations");
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
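
A hypothetical C++ illustration of the distinction this check draws: SCEV can compute a backedge-taken count for the first loop below but not for the second, whose exit depends on loaded data:

void countable(int *a, int n) {
  for (int i = 0; i < n; ++i) // trip count n, computable symbolically
    a[i] = 0;
}

void uncountable(int *a) {
  for (int i = 0; a[i] != 0; ++i) // exit depends on memory contents
    a[i] = 0;
}
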
@@ -3879,10 +4139,28 @@ bool LoopVectorizationLegality::canVectorize() {
: "")
<< "!\n");
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
// Analyze interleaved memory accesses.
- if (EnableInterleavedMemAccesses)
+ if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(Strides);
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ emitAnalysis(VectorizationReport()
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
+ DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+ return false;
+ }
+
// Okay! We can vectorize. At this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return true with
// no restrictions.
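
Each predicate counted against this threshold becomes a runtime guard on the versioned loop. A hand-written analogue of the shape such versioning takes; the unit-stride guard is illustrative, not the emitted IR:

void axpy(float *y, const float *x, long stride, long n) {
  if (stride == 1) { // runtime check for an assumed unit stride
    for (long i = 0; i < n; ++i)
      y[i] += x[i]; // vectorizable version
  } else {
    for (long i = 0; i < n; ++i)
      y[i * stride] += x[i * stride]; // scalar fallback
  }
}
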
@@ -3929,7 +4207,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
// Look for the attribute signaling the absence of NaNs.
@@ -3953,7 +4230,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!PhiTy->isIntegerTy() &&
!PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
- emitAnalysis(VectorizationReport(it)
+ emitAnalysis(VectorizationReport(&*it)
<< "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
@@ -3965,9 +4242,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (*bb != Header) {
// Check that this instruction has no outside users or is an
// identified reduction value with an outside user.
- if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))
continue;
- emitAnalysis(VectorizationReport(it) <<
+ emitAnalysis(VectorizationReport(&*it) <<
"value could not be identified as "
"an induction or reduction variable");
return false;
@@ -3975,19 +4252,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// We only allow if-converted PHIs with exactly two incoming values.
if (Phi->getNumIncomingValues() != 2) {
- emitAnalysis(VectorizationReport(it)
+ emitAnalysis(VectorizationReport(&*it)
<< "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
- // This is the value coming from the preheader.
- Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
- ConstantInt *StepValue = nullptr;
- // Check if this is an induction variable.
- InductionKind IK = isInductionVariable(Phi, StepValue);
-
- if (IK_NoInduction != IK) {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) {
+ Inductions[Phi] = ID;
// Get the widest type.
if (!WidestIndTy)
WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
@@ -3995,21 +4268,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
// Int inductions are special because we only allow one IV.
- if (IK == IK_IntInduction && StepValue->isOne()) {
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
// Use the phi node with the widest type as induction. Use the last
// one if there are multiple (no good reason for doing this other
- // than it is expedient).
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
if (!Induction || PhiTy == WidestIndTy)
Induction = Phi;
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
- Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
// Until we explicitly handle the case of an induction variable with
// an outside loop user we have to give up vectorizing this loop.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
- emitAnalysis(VectorizationReport(it) <<
+ if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
+ emitAnalysis(VectorizationReport(&*it) <<
"use of induction value outside of the "
"loop is not handled by vectorizer");
return false;
@@ -4020,11 +4296,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,
Reductions[Phi])) {
+ if (Reductions[Phi].hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(
+ Reductions[Phi].getUnsafeAlgebraInst());
AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
continue;
}
- emitAnalysis(VectorizationReport(it) <<
+ emitAnalysis(VectorizationReport(&*it) <<
"value that could not be identified as "
"reduction is used outside the loop");
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
@@ -4039,8 +4318,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
- emitAnalysis(VectorizationReport(it) <<
- "call instruction cannot be vectorized");
+ emitAnalysis(VectorizationReport(&*it)
+ << "call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
}
@@ -4049,8 +4328,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// second argument is the same (i.e. loop invariant)
if (CI &&
hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
- if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
- emitAnalysis(VectorizationReport(it)
+ auto *SE = PSE.getSE();
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+ emitAnalysis(VectorizationReport(&*it)
<< "intrinsic instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
return false;
@@ -4061,7 +4341,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(it->getType()) &&
!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
- emitAnalysis(VectorizationReport(it)
+ emitAnalysis(VectorizationReport(&*it)
<< "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
@@ -4085,8 +4365,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
- emitAnalysis(VectorizationReport(it) <<
+ if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
+ emitAnalysis(VectorizationReport(&*it) <<
"value cannot be used outside the loop");
return false;
}
@@ -4104,6 +4384,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (Induction && WidestIndTy != Induction->getType())
+ Induction = nullptr;
+
return true;
}
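
A hypothetical example of why this reset matters: in the loop below (assuming an LP64 target, so long is i64), the widest induction type is i64. If the canonical Induction chosen earlier were the i32 PHI, it is unset here and the vectorizer later creates one of the right width.

void two_ivs(float *a, long n) {
  int j = 0;
  for (long i = 0; i < n; ++i, ++j) // i is i64, j is i32
    a[i] = static_cast<float>(j);
}
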
@@ -4116,7 +4402,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
else
return;
- Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
+ Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop);
if (!Stride)
return;
@@ -4142,7 +4428,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
BE = TheLoop->block_end(); B != BE; ++B)
for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
I != IE; ++I)
- if (I->getType()->isPointerTy() && isConsecutivePtr(I))
+ if (I->getType()->isPointerTy() && isConsecutivePtr(&*I))
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
while (!Worklist.empty()) {
@@ -4179,30 +4465,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return false;
}
- if (LAI->getNumRuntimePointerChecks() >
- VectorizerParams::RuntimeMemoryCheckThreshold) {
- emitAnalysis(VectorizationReport()
- << LAI->getNumRuntimePointerChecks() << " exceeds limit of "
- << VectorizerParams::RuntimeMemoryCheckThreshold
- << " dependent memory operations checked at runtime");
- DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- return false;
- }
- return true;
-}
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->PSE.getUnionPredicate());
-LoopVectorizationLegality::InductionKind
-LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
- ConstantInt *&StepValue) {
- if (!isInductionPHI(Phi, SE, StepValue))
- return IK_NoInduction;
-
- Type *PhiTy = Phi->getType();
- // Found an Integer induction variable.
- if (PhiTy->isIntegerTy())
- return IK_IntInduction;
- // Found an Pointer induction variable.
- return IK_PtrInduction;
+ return true;
}
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
@@ -4256,8 +4522,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
!isSinglePredecessor) {
- // Build a masked store if it is legal for the target, otherwise scalarize
- // the block.
+ // Build a masked store if it is legal for the target, otherwise
+ // scalarize the block.
bool isLegalMaskedOp =
isLegalMaskedStore(SI->getValueOperand()->getType(),
SI->getPointerOperand());
@@ -4315,7 +4581,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses(
StoreInst *SI = dyn_cast<StoreInst>(I);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
- int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides);
+ int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides);
// The factor of the corresponding interleave group.
unsigned Factor = std::abs(Stride);
@@ -4324,7 +4590,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses(
if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
continue;
- const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
+ const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
@@ -4411,12 +4677,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
continue;
// Calculate the distance and prepare for the rule 3.
- const SCEVConstant *DistToA =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev));
+ const SCEVConstant *DistToA = dyn_cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev));
if (!DistToA)
continue;
- int DistanceToA = DistToA->getValue()->getValue().getSExtValue();
+ int DistanceToA = DistToA->getAPInt().getSExtValue();
    // Skip if the distance is not a multiple of the size, as then they are
    // not in the same group.
@@ -4454,8 +4720,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
emitAnalysis(VectorizationReport() <<
"runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os");
- DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
+ "compiling with -Os/-Oz");
+ DEBUG(dbgs() <<
+ "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return Factor;
}
@@ -4467,10 +4734,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
}
// Find the trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
- unsigned WidestType = getWidestType();
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
@@ -4478,7 +4747,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
- DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+
+ DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
+ << WidestType << " bits.\n");
DEBUG(dbgs() << "LV: The Widest register is: "
<< WidestRegister << " bits.\n");
@@ -4491,6 +4762,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
" into one vector!");
unsigned VF = MaxVectorSize;
+ if (MaximizeBandwidth && !OptForSize) {
+ // Collect all viable vectorization factors.
+ SmallVector<unsigned, 8> VFs;
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+ VFs.push_back(VS);
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+ VF = VFs[i];
+ break;
+ }
+ }
+ }
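
A standalone model of this selection: candidate VFs double from MaxVectorSize up to WidestRegister / SmallestType, and the largest one whose estimated usage fits in the register file wins, falling back to MaxVectorSize otherwise:

#include <vector>

unsigned pick_vf(const std::vector<unsigned> &VFs,
                 const std::vector<unsigned> &MaxLocalUsers,
                 unsigned TargetNumRegisters) {
  unsigned VF = VFs.front(); // MaxVectorSize fallback
  for (int i = int(VFs.size()) - 1; i >= 0; --i)
    if (MaxLocalUsers[i] <= TargetNumRegisters) {
      VF = VFs[i];
      break;
    }
  return VF;
}
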
// If we optimize the program for size, avoid creating the tail loop.
if (OptForSize) {
@@ -4499,7 +4790,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
emitAnalysis
(VectorizationReport() <<
"unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
@@ -4515,8 +4806,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
"cannot optimize for size and vectorize at the "
"same time. Enable vectorization of this loop "
"with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+ "when compiling with -Os/-Oz");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
}
@@ -4566,7 +4857,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
return Factor;
}
-unsigned LoopVectorizationCostModel::getWidestType() {
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
@@ -4579,18 +4872,22 @@ unsigned LoopVectorizationCostModel::getWidestType() {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
Type *T = it->getType();
- // Ignore ephemeral values.
- if (EphValues.count(it))
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&*it))
continue;
// Only examine Loads, Stores and PHINodes.
if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
continue;
- // Examine PHI nodes that are reduction variables.
- if (PHINode *PN = dyn_cast<PHINode>(it))
- if (!Legal->getReductionVars()->count(PN))
+ // Examine PHI nodes that are reduction variables. Update the type to
+ // account for the recurrence type.
+ if (PHINode *PN = dyn_cast<PHINode>(it)) {
+ if (!Legal->isReductionVariable(PN))
continue;
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ T = RdxDesc.getRecurrenceType();
+ }
// Examine the stored values.
if (StoreInst *ST = dyn_cast<StoreInst>(it))
@@ -4599,15 +4896,17 @@ unsigned LoopVectorizationCostModel::getWidestType() {
// Ignore loaded pointer types and stored pointer types that are not
// consecutive. However, we do want to take consecutive stores/loads of
// pointer vectors into account.
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
continue;
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
MaxWidth = std::max(MaxWidth,
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
}
}
- return MaxWidth;
+ return {MinWidth, MaxWidth};
}
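
Worked numbers for the pair returned here, assuming 128-bit vector registers: a loop that loads i8 values and stores i32 values yields {MinWidth, MaxWidth} = {8, 32}, so:

#include <cstdio>

int main() {
  unsigned WidestRegister = 128, SmallestType = 8, WidestType = 32;
  unsigned MaxVectorSize = WidestRegister / WidestType;  // 4
  unsigned BandwidthMax = WidestRegister / SmallestType; // 16, explored only
                                                         // by MaximizeBandwidth
  std::printf("VF candidates: %u..%u\n", MaxVectorSize, BandwidthMax);
  return 0;
}
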
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
@@ -4628,11 +4927,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.
- // Use the user preference, unless 'auto' is selected.
- int UserUF = Hints->getInterleave();
- if (UserUF != 0)
- return UserUF;
-
// When we optimize for size, we don't interleave.
if (OptForSize)
return 1;
@@ -4642,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;
// Do not interleave loops with a relatively small trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
@@ -4658,7 +4952,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
TargetNumRegisters = ForceTargetNumVectorRegs;
}
- LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
@@ -4756,8 +5050,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
}
// Interleave if this is a large loop (small loops are already dealt with by
- // this
- // point) that could benefit from interleaving.
+ // this point) that could benefit from interleaving.
bool HasReductions = (Legal->getReductionVars()->size() > 0);
if (TTI.enableAggressiveInterleaving(HasReductions)) {
DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
@@ -4768,8 +5061,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;
}
-LoopVectorizationCostModel::RegisterUsage
-LoopVectorizationCostModel::calculateRegisterUsage() {
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+LoopVectorizationCostModel::calculateRegisterUsage(
+ const SmallVector<unsigned, 8> &VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
@@ -4790,8 +5084,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
- RegisterUsage R;
- R.NumInstructions = 0;
+ RegisterUsage RU;
+ RU.NumInstructions = 0;
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
@@ -4810,15 +5104,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
unsigned Index = 0;
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb) {
- R.NumInstructions += (*bb)->size();
- for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
- ++it) {
- Instruction *I = it;
- IdxToInstr[Index++] = I;
+ RU.NumInstructions += (*bb)->size();
+ for (Instruction &I : **bb) {
+ IdxToInstr[Index++] = &I;
// Save the end location of each USE.
- for (unsigned i = 0; i < I->getNumOperands(); ++i) {
- Value *U = I->getOperand(i);
+ for (unsigned i = 0; i < I.getNumOperands(); ++i) {
+ Value *U = I.getOperand(i);
Instruction *Instr = dyn_cast<Instruction>(U);
// Ignore non-instruction values such as arguments, constants, etc.
@@ -4847,42 +5139,85 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
TransposeEnds[it->second].push_back(it->first);
SmallSet<Instruction*, 8> OpenIntervals;
- unsigned MaxUsage = 0;
+ // Get the size of the widest register.
+ unsigned MaxSafeDepDist = -1U;
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+ unsigned WidestRegister =
+ std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ // A lambda that gets the register usage for the given type and VF.
+ auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+ unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
+ return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+ };
+
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
// Ignore instructions that are never used within the loop.
if (!Ends.count(I)) continue;
- // Ignore ephemeral values.
- if (EphValues.count(I))
- continue;
-
// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i];
- for (unsigned int j=0, e = List.size(); j < e; ++j)
+ for (unsigned int j = 0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]);
- // Count the number of live interals.
- MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+ // Skip ignored values.
+ if (ValuesToIgnore.count(I))
+ continue;
- DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
- OpenIntervals.size() << '\n');
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ if (VFs[j] == 1) {
+ MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
+ continue;
+ }
+
+ // Count the number of live intervals.
+ unsigned RegUsage = 0;
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.count(Inst))
+ continue;
+ RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+ }
+
+ DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
}
- unsigned Invariant = LoopInvariants.size();
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
- DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ unsigned Invariant = 0;
+ if (VFs[i] == 1)
+ Invariant = LoopInvariants.size();
+ else {
+ for (auto Inst : LoopInvariants)
+ Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+ }
+
+ DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
- R.LoopInvariantRegs = Invariant;
- R.MaxLocalUsers = MaxUsage;
- return R;
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
+
+ return RUs;
}
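
A worked instance of the GetRegUsage lambda above: with 128-bit registers, a <8 x i32> value needs max(1, 8 * 32 / 128) = 2 registers, while anything narrower than a register still counts as one:

unsigned reg_usage(unsigned TypeSizeBits, unsigned VF,
                   unsigned WidestRegister) {
  unsigned N = VF * TypeSizeBits / WidestRegister;
  return N > 1 ? N : 1; // std::max<unsigned>(1, ...)
}
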
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
@@ -4900,11 +5235,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
if (isa<DbgInfoIntrinsic>(it))
continue;
- // Ignore ephemeral values.
- if (EphValues.count(it))
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&*it))
continue;
- unsigned C = getInstructionCost(it, VF);
+ unsigned C = getInstructionCost(&*it, VF);
// Check if we should override the cost.
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4969,7 +5304,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
if (!C)
return true;
- const APInt &APStepVal = C->getValue()->getValue();
+ const APInt &APStepVal = C->getAPInt();
// Huge step value - give up.
if (APStepVal.getBitWidth() > 64)
@@ -4981,9 +5316,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
- if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1)))
- return true;
- return false;
+ return Legal->hasStride(I->getOperand(0)) ||
+ Legal->hasStride(I->getOperand(1));
}
unsigned
@@ -4994,7 +5328,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
VF = 1;
Type *RetTy = I->getType();
+ if (VF > 1 && MinBWs.count(I))
+ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
Type *VectorTy = ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
@@ -5076,6 +5413,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ auto It = MinBWs.find(Op0AsInstruction);
+ if (VF > 1 && It != MinBWs.end())
+ ValTy = IntegerType::get(ValTy->getContext(), It->second);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
@@ -5199,8 +5540,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());
-
- Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+
+ Type *SrcScalarTy = I->getOperand(0)->getType();
+ Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
+ if (VF > 1 && MinBWs.count(I)) {
+ // This cast is going to be shrunk. This may remove the cast or it might
+ // turn it into slightly different cast. For example, if MinBW == 16,
+ // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+ //
+ // Calculate the modified src and dest types.
+ Type *MinVecTy = VectorTy;
+ if (I->getOpcode() == Instruction::Trunc) {
+ SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
+ MinVecTy);
+ } else if (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt) {
+ SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
+ MinVecTy);
+ }
+ }
+
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
@@ -5240,15 +5601,18 @@ char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBits)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
@@ -5269,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
return false;
}
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
+ SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+
+ // Ignore induction phis that are only used in either GetElementPtr or ICmp
+ // instruction to exit loop. Induction variables usually have large types and
+ // can have big impact when estimating register usage.
+ // This is for when VF > 1.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *PN = Induction.first;
+ auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
+
+ // Check that the PHI is only used by the induction increment (UpdateV) or
+ // by GEPs. Then check that UpdateV is only used by a compare instruction or
+ // the loop header PHI.
+ // FIXME: Need precise def-use analysis to determine if this induction
+ // variable will be vectorized.
+ if (std::all_of(PN->user_begin(), PN->user_end(),
+ [&](const User *U) -> bool {
+ return U == UpdateV || isa<GetElementPtrInst>(U);
+ }) &&
+ std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
+ [&](const User *U) -> bool {
+ return U == PN || isa<ICmpInst>(U);
+ })) {
+ VecValuesToIgnore.insert(PN);
+ VecValuesToIgnore.insert(UpdateV);
+ }
+ }
+
+ // Ignore instructions that will not be vectorized.
+ // This is for when VF > 1.
+ for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
+ ++bb) {
+ for (auto &Inst : **bb) {
+ switch (Inst.getOpcode()) {
+ case Instruction::GetElementPtr: {
+ // Ignore the GEP if its last operand is an induction variable so that
+ // it is a consecutive load/store and won't be vectorized as a
+ // scatter/gather pattern.
+
+ GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
+ unsigned NumOperands = Gep->getNumOperands();
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
+ bool GepToIgnore = true;
+
+ // Check that all of the gep indices are uniform except for the
+ // induction operand.
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if (i != InductionOperand &&
+ !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
+ TheLoop)) {
+ GepToIgnore = false;
+ break;
+ }
+ }
+
+ if (GepToIgnore)
+ VecValuesToIgnore.insert(&Inst);
+ break;
+ }
+ }
+ }
+ }
+}
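
A hypothetical C++ contrast for the GEP rule: in a[i] the only non-invariant index is the induction variable, so the access is consecutive and the GEP is ignored for register-usage estimation; in a[idx[i]] it is not, and the GEP stays counted:

long sum(const long *a, const long *idx, long n, bool indirect) {
  long s = 0;
  for (long i = 0; i < n; ++i)
    s += indirect ? a[idx[i]] : a[i]; // gather vs. consecutive access
  return s;
}
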
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore) {
@@ -5316,19 +5753,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
- Instruction *InsertPt = Builder.GetInsertPoint();
- BasicBlock *IfBlock = Builder.GetInsertBlock();
- BasicBlock *CondBlock = nullptr;
-
VectorParts Cond;
- Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
- VectorLp = LI->getLoopFor(IfBlock);
- assert(VectorLp && "Must have a loop for this block");
}
// For each vector unroll 'part':
@@ -5343,11 +5773,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
ConstantInt::get(Cond[Part]->getType(), 1));
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- LoopVectorBody.push_back(CondBlock);
- VectorLp->addBasicBlockToLoop(CondBlock, *LI);
- // Update Builder with newly created basic block.
- Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
@@ -5367,16 +5792,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
- // End if-block.
- if (IfPredicateStore) {
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- LoopVectorBody.push_back(NewIfBlock);
- VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
- Builder.SetInsertPoint(InsertPt);
- ReplaceInstWithInst(IfBlock->getTerminator(),
- BranchInst::Create(CondBlock, NewIfBlock, Cmp));
- IfBlock = NewIfBlock;
- }
+ // End if-block.
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+ Cmp));
}
}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b180c97..9ed44d1 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -61,7 +62,7 @@ static cl::opt<int>
"number "));
static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
@@ -73,6 +74,14 @@ static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
namespace {
// FIXME: Set this via cl::opt to allow overriding.
@@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10;
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
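
A standalone model of how this budget evolves across the scheduling regions of one block, following the shrink-and-clamp rule applied in resetSchedule() further down:

int next_region_limit(int Limit, int LastRegionSize) {
  Limit -= LastRegionSize;
  const int MinSize = 16; // MinScheduleRegionSize
  return Limit < MinSize ? MinSize : Limit;
}
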
/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
@@ -156,13 +169,11 @@ static unsigned getAltOpcode(unsigned Op) {
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
- if (Op == Instruction::FAdd || Op == Instruction::FSub ||
- Op == Instruction::Sub || Op == Instruction::Add)
- return true;
- return false;
+ return Op == Instruction::FAdd || Op == Instruction::FSub ||
+ Op == Instruction::Sub || Op == Instruction::Add;
}
-/// \returns ShuffleVector instruction if intructions in \p VL have
+/// \returns ShuffleVector instruction if instructions in \p VL have
/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
static unsigned isAltInst(ArrayRef<Value *> VL) {
@@ -242,6 +253,9 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
case LLVMContext::MD_fpmath:
MD = MDNode::getMostGenericFPMath(MD, IMD);
break;
+ case LLVMContext::MD_nontemporal:
+ MD = MDNode::intersect(MD, IMD);
+ break;
}
}
I->setMetadata(Kind, MD);
@@ -393,7 +407,7 @@ public:
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
- /// \returns true if it is benefitial to reverse the vector order.
+ /// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}
@@ -441,7 +455,7 @@ private:
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
- /// \returns whether the VectorizableTree is fully vectoriable and will
+ /// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree();
@@ -506,7 +520,7 @@ private:
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser (Value *S, llvm::User *U, int L) :
- Scalar(S), User(U), Lane(L){};
+ Scalar(S), User(U), Lane(L){}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
@@ -717,6 +731,8 @@ private:
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
ScheduleStart(nullptr), ScheduleEnd(nullptr),
FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+ ScheduleRegionSize(0),
+ ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
SchedulingRegionID(1) {}
@@ -728,6 +744,13 @@ private:
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
@@ -804,7 +827,8 @@ private:
void cancelScheduling(ArrayRef<Value *> VL);
/// Extends the scheduling region so that V is inside the region.
- void extendSchedulingRegion(Value *V);
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -858,6 +882,12 @@ private:
/// (can be null).
ScheduleData *LastLoadStoreInRegion;
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit;
+
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
int SchedulingRegionID;
@@ -1077,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (!BS.tryScheduleBundle(VL, this)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- BS.cancelScheduling(VL);
+ assert((!BS.getScheduleData(VL[0]) ||
+ !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false);
return;
}
@@ -1125,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load.
+ // For example, we don't want to vectorize loads that are smaller than
+ // 8 bits.
+ // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
+ // loading/storing it as an i8 struct. If we vectorize loads/stores from
+ // such a struct we read/write packed bits disagreeing with the
+ // unvectorized version.
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *ScalarTy = VL[0]->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) !=
+ DL.getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
      // Check if the loads are consecutive, or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
@@ -1134,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
- const DataLayout &DL = F->getParent()->getDataLayout();
+
if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;
@@ -1690,7 +1739,8 @@ int BoUpSLP::getSpillCost() {
}
// Now find the sequence of instructions between PrevInst and Inst.
- BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
+ BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
+ PrevInstIt(PrevInst->getIterator());
--PrevInstIt;
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
@@ -1890,106 +1940,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
}
}
+// Return true if I should be commuted before adding its left and right
+// operands to the arrays Left and Right.
+//
+// The vectorizer is trying either to have all elements on one side be
+// instructions with the same opcode, to enable further vectorization, or to
+// have a splat to lower the vectorization cost.
+static bool shouldReorderOperands(int i, Instruction &I,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ bool AllSameOpcodeLeft,
+ bool AllSameOpcodeRight, bool SplatLeft,
+ bool SplatRight) {
+ Value *VLeft = I.getOperand(0);
+ Value *VRight = I.getOperand(1);
+ // If we have "SplatRight", try to see if commuting is needed to preserve it.
+ if (SplatRight) {
+ if (VRight == Right[i - 1])
+ // Preserve SplatRight
+ return false;
+ if (VLeft == Right[i - 1]) {
+ // Commuting would preserve SplatRight, but we don't want to break
+ // SplatLeft either, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (SplatLeft && VLeft == Left[i - 1])
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Right side.
+ if (SplatLeft) {
+ if (VLeft == Left[i - 1])
+ // Preserve SplatLeft
+ return false;
+ if (VRight == Left[i - 1])
+ return true;
+ }
+
+ Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+ Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+ // If we have "AllSameOpcodeRight", try to see if the left operands preserves
+ // it and not the right, in this case we want to commute.
+ if (AllSameOpcodeRight) {
+ unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+ if (IRight && RightPrevOpcode == IRight->getOpcode())
+ // Do not commute, a match on the right preserves AllSameOpcodeRight
+ return false;
+ if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+ // We have a match and may want to commute, but first check if there is
+ // not also a match on the existing operands on the Left to preserve
+ // AllSameOpcodeLeft, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (AllSameOpcodeLeft && ILeft &&
+ cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Left side.
+ if (AllSameOpcodeLeft) {
+ unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+ if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+ return false;
+ if (IRight && LeftPrevOpcode == IRight->getOpcode())
+ return true;
+ }
+ return false;
+}
+
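
A simplified standalone model of the SplatRight branch above, with ints standing in for llvm::Value pointers (the SplatLeft tiebreak is omitted): a lane is commuted exactly when its right operand breaks the broadcast but its left operand would preserve it:

#include <cstddef>
#include <utility>
#include <vector>

void keep_right_splat(std::vector<std::pair<int, int>> &Lanes) {
  for (std::size_t i = 1; i < Lanes.size(); ++i)
    if (Lanes[i].second != Lanes[i - 1].second &&
        Lanes[i].first == Lanes[i - 1].second)
      std::swap(Lanes[i].first, Lanes[i].second);
}
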
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
- SmallVector<Value *, 16> OrigLeft, OrigRight;
-
- bool AllSameOpcodeLeft = true;
- bool AllSameOpcodeRight = true;
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
- Value *VLeft = I->getOperand(0);
- Value *VRight = I->getOperand(1);
-
- OrigLeft.push_back(VLeft);
- OrigRight.push_back(VRight);
-
- Instruction *ILeft = dyn_cast<Instruction>(VLeft);
- Instruction *IRight = dyn_cast<Instruction>(VRight);
-
- // Check whether all operands on one side have the same opcode. In this case
- // we want to preserve the original order and not make things worse by
- // reordering.
- if (i && AllSameOpcodeLeft && ILeft) {
- if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
- if (PLeft->getOpcode() != ILeft->getOpcode())
- AllSameOpcodeLeft = false;
- } else
- AllSameOpcodeLeft = false;
- }
- if (i && AllSameOpcodeRight && IRight) {
- if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
- if (PRight->getOpcode() != IRight->getOpcode())
- AllSameOpcodeRight = false;
- } else
- AllSameOpcodeRight = false;
- }
-
- // Sort two opcodes. In the code below we try to preserve the ability to use
- // broadcast of values instead of individual inserts.
- // vl1 = load
- // vl2 = phi
- // vr1 = load
- // vr2 = vr2
- // = vl1 x vr1
- // = vl2 x vr2
- // If we just sorted according to opcode we would leave the first line in
- // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
- // = vl1 x vr1
- // = vr2 x vl2
- // Because vr2 and vr1 are from the same load we loose the opportunity of a
- // broadcast for the packed right side in the backend: we have [vr1, vl2]
- // instead of [vr1, vr2=vr1].
- if (ILeft && IRight) {
- if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
- Right[i - 1] != IRight) {
- // Try not to destroy a broad cast for no apparent benefit.
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
- Right[i - 1] == ILeft) {
- // Try preserve broadcasts.
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
- Left[i - 1] == IRight) {
- // Try preserve broadcasts.
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else {
- Left.push_back(ILeft);
- Right.push_back(IRight);
- }
- continue;
- }
- // One opcode, put the instruction on the right.
- if (ILeft) {
- Left.push_back(VRight);
- Right.push_back(ILeft);
- continue;
- }
+ if (VL.size()) {
+ // Peel the first iteration out of the loop since there's nothing
+ // interesting to do anyway and it simplifies the checks in the loop.
+ auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
+ auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+ if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
+ // Favor having an instruction on the right. FIXME: why?
+ std::swap(VLeft, VRight);
Left.push_back(VLeft);
Right.push_back(VRight);
}
- bool LeftBroadcast = isSplat(Left);
- bool RightBroadcast = isSplat(Right);
-
- // If operands end up being broadcast return this operand order.
- if (LeftBroadcast || RightBroadcast)
- return;
+ // Keep track if we have instructions with all the same opcode on one side.
+ bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
+ bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+ // Keep track if we have one side with all the same value (broadcast).
+ bool SplatLeft = true;
+ bool SplatRight = true;
- // Don't reorder if the operands where good to begin.
- if (AllSameOpcodeRight || AllSameOpcodeLeft) {
- Left = OrigLeft;
- Right = OrigRight;
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ assert(I->isCommutative() && "Can only process commutative instruction");
+ // Commute to favor either a splat or maximizing having the same opcodes on
+ // one side.
+ if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight)) {
+ Left.push_back(I->getOperand(1));
+ Right.push_back(I->getOperand(0));
+ } else {
+ Left.push_back(I->getOperand(0));
+ Right.push_back(I->getOperand(1));
+ }
+ // Update Splat* and AllSameOpcode* after the insertion.
+ SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+ SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+ AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+ (cast<Instruction>(Left[i - 1])->getOpcode() ==
+ cast<Instruction>(Left[i])->getOpcode());
+ AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+ (cast<Instruction>(Right[i - 1])->getOpcode() ==
+ cast<Instruction>(Right[i])->getOpcode());
}
+ // If one operand ends up being a broadcast, return this operand order.
+ if (SplatRight || SplatLeft)
+ return;
+
const DataLayout &DL = F->getParent()->getDataLayout();
// Finally check if we can get a longer vectorizable chain by reordering
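
The reordering logic above is easier to follow in isolation. Below is a minimal standalone sketch (plain C++ with invented names, not LLVM code) of the core splat-preserving rule: for a sequence of commutative (left, right) operand pairs, swap a pair whenever that keeps the right column equal to its previous entry, so the right side stays a broadcast. The real pass also tracks AllSameOpcodeLeft/Right and SplatLeft via shouldReorderOperands; this sketch keeps only the one rule.

#include <cassert>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

using OperandPair = std::pair<std::string, std::string>;

static void reorderForSplat(std::vector<OperandPair> &Pairs) {
  for (std::size_t i = 1; i < Pairs.size(); ++i) {
    // Swap if that makes the current right operand repeat the previous one.
    if (Pairs[i].first == Pairs[i - 1].second &&
        Pairs[i].second != Pairs[i - 1].second)
      std::swap(Pairs[i].first, Pairs[i].second);
  }
}

int main() {
  // "= vl1 x vr1" then "= vr1 x vl2": swapping the second pair makes the
  // right column [vr1, vr1], a broadcast the backend can exploit.
  std::vector<OperandPair> Pairs = {{"vl1", "vr1"}, {"vr1", "vl2"}};
  reorderForSplat(Pairs);
  assert(Pairs[1].second == "vr1" && Pairs[1].first == "vl2");
  return 0;
}
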
@@ -2030,7 +2100,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
Instruction *VL0 = cast<Instruction>(VL[0]);
- BasicBlock::iterator NextInst = VL0;
+ BasicBlock::iterator NextInst(VL0);
++NextInst;
Builder.SetInsertPoint(VL0->getParent(), NextInst);
Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
@@ -2487,7 +2557,7 @@ Value *BoUpSLP::vectorizeTree() {
scheduleBlock(BSIter.second.get());
}
- Builder.SetInsertPoint(F->getEntryBlock().begin());
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
vectorizeTree(&VectorizableTree[0]);
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
@@ -2532,7 +2602,7 @@ Value *BoUpSLP::vectorizeTree() {
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
- Builder.SetInsertPoint(F->getEntryBlock().begin());
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
@@ -2641,7 +2711,7 @@ void BoUpSLP::optimizeGatherSequence() {
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
- Instruction *In = it++;
+ Instruction *In = &*it++;
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
@@ -2681,8 +2751,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V))
+ return false;
+ }
+
for (Value *V : VL) {
- extendSchedulingRegion(V);
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
@@ -2743,7 +2820,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
schedule(pickedSD, ReadyInsts);
}
}
- return Bundle->isReady();
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL);
+ return false;
+ }
+ return true;
}
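
The restructuring above moves every fallible extendSchedulingRegion call in front of the stateful bundle construction, so a late failure no longer leaves half-built state behind. A generic standalone sketch of that check-then-commit shape (illustrative C++, not LLVM code):

#include <vector>

template <typename T, typename CheckFn, typename CommitFn>
static bool checkThenCommit(const std::vector<T> &Items, CheckFn Check,
                            CommitFn Commit) {
  for (const T &Item : Items)
    if (!Check(Item))
      return false; // nothing has been mutated yet, nothing to undo
  for (const T &Item : Items)
    Commit(Item); // only reached once every element has passed the check
  return true;
}

int main() {
  std::vector<int> Bundle = {1, 2, 3};
  int Scheduled = 0;
  bool OK = checkThenCommit(Bundle, [](int V) { return V > 0; },
                            [&Scheduled](int V) { Scheduled += V; });
  return (OK && Scheduled == 6) ? 0 : 1;
}
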
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
@@ -2772,9 +2853,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
}
}
-void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
if (getScheduleData(V))
- return;
+ return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
@@ -2785,21 +2866,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
- return;
+ return true;
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
- BasicBlock::reverse_iterator UpIter(ScheduleStart);
+ BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter(ScheduleEnd);
BasicBlock::iterator LowerEnd = BB->end();
for (;;) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
if (UpIter != UpperEnd) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
- return;
+ return true;
}
UpIter++;
}
@@ -2810,13 +2896,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
- return;
+ return true;
}
DownIter++;
}
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
}
+ return true;
}
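
A standalone model (plain C++ with invented names, not LLVM code) of the capped bidirectional search extendSchedulingRegion now performs: widen a [Begin, End) window one step upward and one step downward per iteration until it covers Target, and refuse once a size budget is exhausted, mirroring the ScheduleRegionSizeLimit check above.

#include <cstdio>

static bool extendRegion(int &Begin, int &End, int Target, int BlockSize,
                         int SizeLimit) {
  if (Target >= Begin && Target < End)
    return true; // already inside the region
  int Size = End - Begin;
  while (Begin > 0 || End < BlockSize) {
    if (++Size > SizeLimit)
      return false; // too expensive: refuse instead of growing unboundedly
    if (Begin > 0 && --Begin == Target)
      return true; // region start extended down to Target
    if (End < BlockSize && End++ == Target)
      return true; // region end extended up past Target
  }
  return false; // Target is not in this block at all
}

int main() {
  int Begin = 10, End = 12;
  std::printf("%d\n", extendRegion(Begin, End, 6, 100, 16));  // 1: extended
  std::printf("%d\n", extendRegion(Begin, End, 90, 100, 16)); // 0: limit hit
  return 0;
}
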
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
@@ -2896,8 +2983,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
} else {
// I'm not sure if this can ever happen. But we need to be safe.
- // This lets the instruction/bundle never be scheduled and eventally
- // disable vectorization.
+ // This lets the instruction/bundle never be scheduled and
+ // eventually disable vectorization.
BundleMember->Dependencies++;
BundleMember->incrementUnscheduledDeps(1);
}
@@ -3003,7 +3090,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
- // Ensure that all depencency data is updated and fill the ready-list with
+ // Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
@@ -3035,7 +3122,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
Instruction *pickedInst = BundleMember->Inst;
if (LastScheduledInst->getNextNode() != pickedInst) {
BS->BB->getInstList().remove(pickedInst);
- BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
}
LastScheduledInst = pickedInst;
BundleMember = BundleMember->NextInBundle;
@@ -3074,11 +3162,11 @@ struct SLPVectorizer : public FunctionPass {
if (skipOptnoneFunction(F))
return false;
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
- AA = &getAnalysis<AliasAnalysis>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -3139,13 +3227,15 @@ struct SLPVectorizer : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolution>();
- AU.addRequired<AliasAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
@@ -3260,15 +3350,26 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// Do a quadratic search on all of the given stores and find
// all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- for (unsigned j = 0; j < e; ++j) {
- if (i == j)
- continue;
- const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
- if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
- Tails.insert(Stores[j]);
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search the
+ // Stores array in the sequence: from i+1 to e, then from i-1 to 0.
+ // Pairing with the immediately succeeding or preceding candidate usually
+ // creates the best chance of finding an SLP vectorization opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[j];
+ ConsecutiveChain[Stores[i]] = Stores[k];
+ break;
}
}
}
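
The visit order is the heart of this hunk. A standalone sketch of just the IndexQueue construction (illustrative, not LLVM code), showing that for element i the candidates i+1..e-1 are tried first, then i-1..0:

#include <cstdio>
#include <vector>

static std::vector<unsigned> visitOrder(unsigned i, unsigned e) {
  std::vector<unsigned> Queue;
  for (unsigned j = i + 1; j < e; ++j)
    Queue.push_back(j); // forward neighbors first ...
  for (unsigned j = i; j > 0; --j)
    Queue.push_back(j - 1); // ... then backward neighbors
  return Queue;
}

int main() {
  for (unsigned k : visitOrder(2, 5))
    std::printf("%u ", k); // prints: 3 4 1 0
  return 0;
}
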
@@ -3428,7 +3529,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
IRBuilder<true, NoFolder> Builder(
- ++BasicBlock::iterator(InsertAfter));
+ InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
InsertElementInst *IE = cast<InsertElementInst>(V);
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
@@ -3552,16 +3653,17 @@ class HorizontalReduction {
unsigned ReductionOpcode;
/// The opcode of the values we perform a reduction on.
unsigned ReducedValueOpcode;
- /// The width of one full horizontal reduction operation.
- unsigned ReduxWidth;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction;
public:
+ /// The width of one full horizontal reduction operation.
+ unsigned ReduxWidth;
+
HorizontalReduction()
: ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
- ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+ ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -3607,11 +3709,11 @@ public:
return false;
// Post-order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators.
- SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+ // trees containing only binary operators or selects.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, 0));
while (!Stack.empty()) {
- BinaryOperator *TreeN = Stack.back().first;
+ Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
@@ -3647,9 +3749,10 @@ public:
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
- BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
- if (Next)
- Stack.push_back(std::make_pair(Next, 0));
+ // We currently only allow BinaryOperators and SelectInsts as reduction
+ // values in our tree.
+ if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
+ Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
else if (NextV != Phi)
return false;
}
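
The Stack of (instruction, operand-index) pairs above is an iterative post-order walk. A standalone sketch of the same traversal shape (plain C++, invented Node type, not LLVM code): interior nodes carry the reduction opcode, anything else is treated as a reduced-value leaf.

#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  char Opcode;  // e.g. '+' for the reduction operation
  int Value;    // payload, meaningful for leaves
  Node *L, *R;  // children; null for leaves
};

static std::vector<int> collectReducedValues(Node *Root, char RdxOpcode) {
  std::vector<int> Leaves;
  std::vector<std::pair<Node *, int>> Stack{{Root, 0}};
  while (!Stack.empty()) {
    Node *N = Stack.back().first;
    int Edge = Stack.back().second++; // next edge to visit for this node
    if (N->Opcode != RdxOpcode) {     // a reduced value, not part of the tree
      Leaves.push_back(N->Value);
      Stack.pop_back();
      continue;
    }
    if (Edge >= 2) { // both operands already visited
      Stack.pop_back();
      continue;
    }
    if (Node *Next = (Edge == 0 ? N->L : N->R))
      Stack.push_back({Next, 0});
  }
  return Leaves;
}

int main() {
  Node A{'x', 1, nullptr, nullptr}, B{'x', 2, nullptr, nullptr},
      C{'x', 3, nullptr, nullptr};
  Node P{'+', 0, &A, &B}, Root{'+', 0, &P, &C};
  for (int V : collectReducedValues(&Root, '+'))
    std::printf("%d ", V); // prints: 1 2 3
  return 0;
}
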
@@ -3717,9 +3820,12 @@ public:
return VectorizedTree != nullptr;
}
-private:
+ unsigned numReductionValues() const {
+ return ReducedVals.size();
+ }
- /// \brief Calcuate the cost of a reduction.
+private:
+ /// \brief Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -3825,6 +3931,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
+/// \brief Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+ auto DominatedReduxValue = [&](Value *R) {
+ return (
+ dyn_cast<Instruction>(R) &&
+ DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch; return the incoming value if it comes from
+ // that block. This reduction pattern occasionally turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
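
A standalone model (plain C++, invented names, not LLVM code) of the selection order getReductionValue implements: prefer the phi input arriving from the parent block, then the one arriving from the loop latch, and accept a candidate only if it passes the dominance guard.

#include <functional>
#include <initializer_list>
#include <string>

struct PhiIn {
  std::string Block; // incoming block of this phi operand
  int Value;         // incoming value of this phi operand
};

static int pickReductionValue(const PhiIn (&In)[2], const std::string &Parent,
                              const std::string &Latch,
                              const std::function<bool(int)> &Dominates) {
  for (const std::string *BB : {&Parent, &Latch})
    for (const PhiIn &I : In)
      if (I.Block == *BB && Dominates(I.Value))
        return I.Value; // stands in for "return Rdx"
  return -1; // stands in for "return nullptr"
}

int main() {
  PhiIn In[2] = {{"entry", 10}, {"latch", 20}};
  // The parent block contributes no incoming value here, so the latch's
  // value is chosen (the Dominates stub accepts everything).
  return pickReductionValue(In, "body", "latch",
                            [](int) { return true; }) == 20 ? 0 : 1;
}
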
+/// \brief Attempt to vectorize a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding
+/// the phi node P with the reduction operator BI, then check if it
+/// can be done.
+/// \returns true if a horizontal reduction was matched and reduced.
+/// \returns false if a horizontal reduction was not matched.
+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
+ BoUpSLP &R, TargetTransformInfo *TTI) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ HorizontalReduction HorRdx;
+ if (!HorRdx.matchAssociativeReduction(P, BI))
+ return false;
+
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power of two. We can safely generate oversized
+ // vectors and rely on the backend to split them into legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+
+ return HorRdx.tryToReduce(R, TTI);
+}
+
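
The width computation above rounds the number of reduction values down to a power of two, with a floor of 4. A standalone sketch with powerOf2Floor reimplemented for illustration (the real pass uses llvm::PowerOf2Floor from Support/MathExtras.h):

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint64_t powerOf2Floor(uint64_t N) {
  uint64_t P = 1;
  while (P <= N / 2)
    P *= 2; // largest power of two that is <= N
  return N ? P : 0;
}

int main() {
  assert(powerOf2Floor(9) == 8);
  assert(powerOf2Floor(4) == 4);
  // Mirrors: ReduxWidth = std::max((uint64_t)4, PowerOf2Floor(NumVals));
  assert(std::max<uint64_t>(4, powerOf2Floor(3)) == 4); // clamped up to 4
  assert(std::max<uint64_t>(4, powerOf2Floor(9)) == 8); // rounded down to 8
  return 0;
}
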
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
@@ -3881,7 +4063,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times, so skip the ones we have already checked.
- if (!VisitedInstrs.insert(it).second)
+ if (!VisitedInstrs.insert(&*it).second)
continue;
if (isa<DbgInfoIntrinsic>(it))
@@ -3892,20 +4074,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx =
- (P->getIncomingBlock(0) == BB
- ? (P->getIncomingValue(0))
- : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
- : nullptr));
+
+ Value *Rdx = getReductionValue(DT, P, BB, LI);
+
// Check if this is a Binary Operator.
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
if (!BI)
continue;
// Try to match and vectorize a horizontal reduction.
- HorizontalReduction HorRdx;
- if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
- HorRdx.tryToReduce(R, TTI)) {
+ if (canMatchHorizontalReduction(P, BI, R, TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -3928,15 +4106,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- // Try to vectorize horizontal reductions feeding into a store.
if (ShouldStartVectorizeHorAtStore)
if (StoreInst *SI = dyn_cast<StoreInst>(it))
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
- HorRdx.tryToReduce(R, TTI)) ||
- tryToVectorize(BinOp, R))) {
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
+ tryToVectorize(BinOp, R)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -4037,10 +4212,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)