summaryrefslogtreecommitdiffstats
path: root/lib/Transforms/Vectorize/SLPVectorizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp489
1 files changed, 332 insertions, 157 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7bac407..40abfc7 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -61,7 +62,7 @@ static cl::opt<int>
"number "));
static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
@@ -73,6 +74,14 @@ static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
+/// Limits the size of scheduling regions in a block.
+/// It avoid long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
namespace {
// FIXME: Set this via cl::opt to allow overriding.
@@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10;
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
@@ -156,13 +169,11 @@ static unsigned getAltOpcode(unsigned Op) {
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
- if (Op == Instruction::FAdd || Op == Instruction::FSub ||
- Op == Instruction::Sub || Op == Instruction::Add)
- return true;
- return false;
+ return Op == Instruction::FAdd || Op == Instruction::FSub ||
+ Op == Instruction::Sub || Op == Instruction::Add;
}
-/// \returns ShuffleVector instruction if intructions in \p VL have
+/// \returns ShuffleVector instruction if instructions in \p VL have
/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
static unsigned isAltInst(ArrayRef<Value *> VL) {
@@ -242,6 +253,9 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
case LLVMContext::MD_fpmath:
MD = MDNode::getMostGenericFPMath(MD, IMD);
break;
+ case LLVMContext::MD_nontemporal:
+ MD = MDNode::intersect(MD, IMD);
+ break;
}
}
I->setMetadata(Kind, MD);
@@ -393,7 +407,7 @@ public:
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
- /// \returns true if it is benefitial to reverse the vector order.
+ /// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}
@@ -441,7 +455,7 @@ private:
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
- /// \returns whether the VectorizableTree is fully vectoriable and will
+ /// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree();
@@ -506,7 +520,7 @@ private:
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser (Value *S, llvm::User *U, int L) :
- Scalar(S), User(U), Lane(L){};
+ Scalar(S), User(U), Lane(L){}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
@@ -717,6 +731,8 @@ private:
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
ScheduleStart(nullptr), ScheduleEnd(nullptr),
FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+ ScheduleRegionSize(0),
+ ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
SchedulingRegionID(1) {}
@@ -728,6 +744,13 @@ private:
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
@@ -804,7 +827,8 @@ private:
void cancelScheduling(ArrayRef<Value *> VL);
/// Extends the scheduling region so that V is inside the region.
- void extendSchedulingRegion(Value *V);
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
@@ -858,6 +882,12 @@ private:
/// (can be null).
ScheduleData *LastLoadStoreInRegion;
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit;
+
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
int SchedulingRegionID;
@@ -1077,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (!BS.tryScheduleBundle(VL, this)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- BS.cancelScheduling(VL);
+ assert((!BS.getScheduleData(VL[0]) ||
+ !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false);
return;
}
@@ -1125,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load.
+ // For example we don't want vectorize loads that are smaller than 8 bit.
+ // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
+ // loading/storing it as an i8 struct. If we vectorize loads/stores from
+ // such a struct we read/write packed bits disagreeing with the
+ // unvectorized version.
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *ScalarTy = VL[0]->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) !=
+ DL.getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
// Check if the loads are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
@@ -1134,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
- const DataLayout &DL = F->getParent()->getDataLayout();
+
if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;
@@ -1692,7 +1741,8 @@ int BoUpSLP::getSpillCost() {
}
// Now find the sequence of instructions between PrevInst and Inst.
- BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
+ BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
+ PrevInstIt(PrevInst->getIterator());
--PrevInstIt;
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
@@ -1892,106 +1942,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
}
}
+// Return true if I should be commuted before adding it's left and right
+// operands to the arrays Left and Right.
+//
+// The vectorizer is trying to either have all elements one side being
+// instruction with the same opcode to enable further vectorization, or having
+// a splat to lower the vectorizing cost.
+static bool shouldReorderOperands(int i, Instruction &I,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ bool AllSameOpcodeLeft,
+ bool AllSameOpcodeRight, bool SplatLeft,
+ bool SplatRight) {
+ Value *VLeft = I.getOperand(0);
+ Value *VRight = I.getOperand(1);
+ // If we have "SplatRight", try to see if commuting is needed to preserve it.
+ if (SplatRight) {
+ if (VRight == Right[i - 1])
+ // Preserve SplatRight
+ return false;
+ if (VLeft == Right[i - 1]) {
+ // Commuting would preserve SplatRight, but we don't want to break
+ // SplatLeft either, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (SplatLeft && VLeft == Left[i - 1])
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Right side.
+ if (SplatLeft) {
+ if (VLeft == Left[i - 1])
+ // Preserve SplatLeft
+ return false;
+ if (VRight == Left[i - 1])
+ return true;
+ }
+
+ Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+ Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+ // If we have "AllSameOpcodeRight", try to see if the left operands preserves
+ // it and not the right, in this case we want to commute.
+ if (AllSameOpcodeRight) {
+ unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+ if (IRight && RightPrevOpcode == IRight->getOpcode())
+ // Do not commute, a match on the right preserves AllSameOpcodeRight
+ return false;
+ if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+ // We have a match and may want to commute, but first check if there is
+ // not also a match on the existing operands on the Left to preserve
+ // AllSameOpcodeLeft, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (AllSameOpcodeLeft && ILeft &&
+ cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Left side.
+ if (AllSameOpcodeLeft) {
+ unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+ if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+ return false;
+ if (IRight && LeftPrevOpcode == IRight->getOpcode())
+ return true;
+ }
+ return false;
+}
+
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
- SmallVector<Value *, 16> OrigLeft, OrigRight;
-
- bool AllSameOpcodeLeft = true;
- bool AllSameOpcodeRight = true;
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
- Value *VLeft = I->getOperand(0);
- Value *VRight = I->getOperand(1);
-
- OrigLeft.push_back(VLeft);
- OrigRight.push_back(VRight);
-
- Instruction *ILeft = dyn_cast<Instruction>(VLeft);
- Instruction *IRight = dyn_cast<Instruction>(VRight);
-
- // Check whether all operands on one side have the same opcode. In this case
- // we want to preserve the original order and not make things worse by
- // reordering.
- if (i && AllSameOpcodeLeft && ILeft) {
- if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
- if (PLeft->getOpcode() != ILeft->getOpcode())
- AllSameOpcodeLeft = false;
- } else
- AllSameOpcodeLeft = false;
- }
- if (i && AllSameOpcodeRight && IRight) {
- if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
- if (PRight->getOpcode() != IRight->getOpcode())
- AllSameOpcodeRight = false;
- } else
- AllSameOpcodeRight = false;
- }
-
- // Sort two opcodes. In the code below we try to preserve the ability to use
- // broadcast of values instead of individual inserts.
- // vl1 = load
- // vl2 = phi
- // vr1 = load
- // vr2 = vr2
- // = vl1 x vr1
- // = vl2 x vr2
- // If we just sorted according to opcode we would leave the first line in
- // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
- // = vl1 x vr1
- // = vr2 x vl2
- // Because vr2 and vr1 are from the same load we loose the opportunity of a
- // broadcast for the packed right side in the backend: we have [vr1, vl2]
- // instead of [vr1, vr2=vr1].
- if (ILeft && IRight) {
- if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
- Right[i - 1] != IRight) {
- // Try not to destroy a broad cast for no apparent benefit.
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
- Right[i - 1] == ILeft) {
- // Try preserve broadcasts.
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
- Left[i - 1] == IRight) {
- // Try preserve broadcasts.
- Left.push_back(IRight);
- Right.push_back(ILeft);
- } else {
- Left.push_back(ILeft);
- Right.push_back(IRight);
- }
- continue;
- }
- // One opcode, put the instruction on the right.
- if (ILeft) {
- Left.push_back(VRight);
- Right.push_back(ILeft);
- continue;
- }
+ if (VL.size()) {
+ // Peel the first iteration out of the loop since there's nothing
+ // interesting to do anyway and it simplifies the checks in the loop.
+ auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
+ auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+ if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
+ // Favor having instruction to the right. FIXME: why?
+ std::swap(VLeft, VRight);
Left.push_back(VLeft);
Right.push_back(VRight);
}
- bool LeftBroadcast = isSplat(Left);
- bool RightBroadcast = isSplat(Right);
-
- // If operands end up being broadcast return this operand order.
- if (LeftBroadcast || RightBroadcast)
- return;
+ // Keep track if we have instructions with all the same opcode on one side.
+ bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
+ bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+ // Keep track if we have one side with all the same value (broadcast).
+ bool SplatLeft = true;
+ bool SplatRight = true;
- // Don't reorder if the operands where good to begin.
- if (AllSameOpcodeRight || AllSameOpcodeLeft) {
- Left = OrigLeft;
- Right = OrigRight;
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ assert(I->isCommutative() && "Can only process commutative instruction");
+ // Commute to favor either a splat or maximizing having the same opcodes on
+ // one side.
+ if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight)) {
+ Left.push_back(I->getOperand(1));
+ Right.push_back(I->getOperand(0));
+ } else {
+ Left.push_back(I->getOperand(0));
+ Right.push_back(I->getOperand(1));
+ }
+ // Update Splat* and AllSameOpcode* after the insertion.
+ SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+ SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+ AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+ (cast<Instruction>(Left[i - 1])->getOpcode() ==
+ cast<Instruction>(Left[i])->getOpcode());
+ AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+ (cast<Instruction>(Right[i - 1])->getOpcode() ==
+ cast<Instruction>(Right[i])->getOpcode());
}
+ // If one operand end up being broadcast, return this operand order.
+ if (SplatRight || SplatLeft)
+ return;
+
const DataLayout &DL = F->getParent()->getDataLayout();
// Finally check if we can get longer vectorizable chain by reordering
@@ -2032,7 +2102,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
Instruction *VL0 = cast<Instruction>(VL[0]);
- BasicBlock::iterator NextInst = VL0;
+ BasicBlock::iterator NextInst(VL0);
++NextInst;
Builder.SetInsertPoint(VL0->getParent(), NextInst);
Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
@@ -2489,7 +2559,7 @@ Value *BoUpSLP::vectorizeTree() {
scheduleBlock(BSIter.second.get());
}
- Builder.SetInsertPoint(F->getEntryBlock().begin());
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
vectorizeTree(&VectorizableTree[0]);
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
@@ -2534,7 +2604,7 @@ Value *BoUpSLP::vectorizeTree() {
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
- Builder.SetInsertPoint(F->getEntryBlock().begin());
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
@@ -2643,7 +2713,7 @@ void BoUpSLP::optimizeGatherSequence() {
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
- Instruction *In = it++;
+ Instruction *In = &*it++;
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
@@ -2683,8 +2753,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V))
+ return false;
+ }
+
for (Value *V : VL) {
- extendSchedulingRegion(V);
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
@@ -2745,7 +2822,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
schedule(pickedSD, ReadyInsts);
}
}
- return Bundle->isReady();
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL);
+ return false;
+ }
+ return true;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
@@ -2774,9 +2855,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
}
}
-void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
if (getScheduleData(V))
- return;
+ return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
@@ -2787,21 +2868,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
- return;
+ return true;
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
- BasicBlock::reverse_iterator UpIter(ScheduleStart);
+ BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter(ScheduleEnd);
BasicBlock::iterator LowerEnd = BB->end();
for (;;) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
if (UpIter != UpperEnd) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
- return;
+ return true;
}
UpIter++;
}
@@ -2812,13 +2898,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
- return;
+ return true;
}
DownIter++;
}
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
}
+ return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
@@ -2898,8 +2985,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
}
} else {
// I'm not sure if this can ever happen. But we need to be safe.
- // This lets the instruction/bundle never be scheduled and eventally
- // disable vectorization.
+ // This lets the instruction/bundle never be scheduled and
+ // eventually disable vectorization.
BundleMember->Dependencies++;
BundleMember->incrementUnscheduledDeps(1);
}
@@ -3005,7 +3092,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
- // Ensure that all depencency data is updated and fill the ready-list with
+ // Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
@@ -3037,7 +3124,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
Instruction *pickedInst = BundleMember->Inst;
if (LastScheduledInst->getNextNode() != pickedInst) {
BS->BB->getInstList().remove(pickedInst);
- BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
}
LastScheduledInst = pickedInst;
BundleMember = BundleMember->NextInBundle;
@@ -3076,11 +3164,11 @@ struct SLPVectorizer : public FunctionPass {
if (skipOptnoneFunction(F))
return false;
- SE = &getAnalysis<ScalarEvolution>();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
- AA = &getAnalysis<AliasAnalysis>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -3141,13 +3229,15 @@ struct SLPVectorizer : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolution>();
- AU.addRequired<AliasAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
@@ -3262,15 +3352,26 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// Do a quadratic search on all of the given stores and find
// all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- for (unsigned j = 0; j < e; ++j) {
- if (i == j)
- continue;
- const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
- if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
- Tails.insert(Stores[j]);
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because usually pairing with immediate succeeding or preceding
+ // candidate create the best chance to find slp vectorization opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[j];
+ ConsecutiveChain[Stores[i]] = Stores[k];
+ break;
}
}
}
@@ -3430,7 +3531,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
IRBuilder<true, NoFolder> Builder(
- ++BasicBlock::iterator(InsertAfter));
+ InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
InsertElementInst *IE = cast<InsertElementInst>(V);
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
@@ -3554,16 +3655,17 @@ class HorizontalReduction {
unsigned ReductionOpcode;
/// The opcode of the values we perform a reduction on.
unsigned ReducedValueOpcode;
- /// The width of one full horizontal reduction operation.
- unsigned ReduxWidth;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction;
public:
+ /// The width of one full horizontal reduction operation.
+ unsigned ReduxWidth;
+
HorizontalReduction()
: ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
- ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+ ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -3609,11 +3711,11 @@ public:
return false;
// Post order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators.
- SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+ // trees containing only binary operators or selects.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, 0));
while (!Stack.empty()) {
- BinaryOperator *TreeN = Stack.back().first;
+ Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
@@ -3649,9 +3751,10 @@ public:
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
- BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
- if (Next)
- Stack.push_back(std::make_pair(Next, 0));
+ // We currently only allow BinaryOperator's and SelectInst's as reduction
+ // values in our tree.
+ if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
+ Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
else if (NextV != Phi)
return false;
}
@@ -3719,9 +3822,12 @@ public:
return VectorizedTree != nullptr;
}
-private:
+ unsigned numReductionValues() const {
+ return ReducedVals.size();
+ }
- /// \brief Calcuate the cost of a reduction.
+private:
+ /// \brief Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -3827,6 +3933,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
+/// \brief Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+ auto DominatedReduxValue = [&](Value *R) {
+ return (
+ dyn_cast<Instruction>(R) &&
+ DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+ };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch, return the incoming value if it comes from
+ // that. This reduction pattern occassionaly turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
+/// \brief Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding
+/// the phi node P with reduction operators BI, then check if it
+/// can be done.
+/// \returns true if a horizontal reduction was matched and reduced.
+/// \returns false if a horizontal reduction was not matched.
+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
+ BoUpSLP &R, TargetTransformInfo *TTI) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ HorizontalReduction HorRdx;
+ if (!HorRdx.matchAssociativeReduction(P, BI))
+ return false;
+
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+
+ return HorRdx.tryToReduce(R, TTI);
+}
+
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
@@ -3883,7 +4065,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(it).second)
+ if (!VisitedInstrs.insert(&*it).second)
continue;
if (isa<DbgInfoIntrinsic>(it))
@@ -3894,20 +4076,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx =
- (P->getIncomingBlock(0) == BB
- ? (P->getIncomingValue(0))
- : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
- : nullptr));
+
+ Value *Rdx = getReductionValue(DT, P, BB, LI);
+
// Check if this is a Binary Operator.
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
if (!BI)
continue;
// Try to match and vectorize a horizontal reduction.
- HorizontalReduction HorRdx;
- if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
- HorRdx.tryToReduce(R, TTI)) {
+ if (canMatchHorizontalReduction(P, BI, R, TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -3930,15 +4108,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- // Try to vectorize horizontal reductions feeding into a store.
if (ShouldStartVectorizeHorAtStore)
if (StoreInst *SI = dyn_cast<StoreInst>(it))
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
- HorRdx.tryToReduce(R, TTI)) ||
- tryToVectorize(BinOp, R))) {
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
+ tryToVectorize(BinOp, R)) {
Changed = true;
it = BB->begin();
e = BB->end();
@@ -4039,10 +4214,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
OpenPOWER on IntegriCloud