Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt    |   2
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 538
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 348
-rw-r--r--  lib/Transforms/Vectorize/VecUtils.cpp      | 730
-rw-r--r--  lib/Transforms/Vectorize/VecUtils.h        | 164
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp     |   7
6 files changed, 1651 insertions(+), 138 deletions(-)
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index e64034a..7ae082f 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize
BBVectorize.cpp
Vectorize.cpp
LoopVectorize.cpp
+ SLPVectorizer.cpp
+ VecUtils.cpp
)
add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index acf2b81..08d3725 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizer uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
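To make the widening concrete, here is a hand-written C++ sketch (illustrative only, not output of the pass) of a scalar loop next to its conceptual four-wide counterpart:

    #include <cstddef>

    // Scalar form: one element per iteration.
    void axpy_scalar(float *y, const float *x, float a, size_t n) {
      for (size_t i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
    }

    // Conceptually widened form (VF = 4): each 'wide' iteration processes
    // four consecutive elements and the induction variable advances by 4.
    // The real pass emits vector IR rather than unrolled scalar code.
    void axpy_widened(float *y, const float *x, float a, size_t n) {
      size_t i = 0;
      for (; i + 4 <= n; i += 4) {
        y[i + 0] = a * x[i + 0] + y[i + 0];
        y[i + 1] = a * x[i + 1] + y[i + 1];
        y[i + 2] = a * x[i + 2] + y[i + 2];
        y[i + 3] = a * x[i + 3] + y[i + 3];
      }
      for (; i < n; ++i) // scalar epilogue for the remainder
        y[i] = a * x[i] + y[i];
    }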
@@ -78,7 +78,9 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -87,6 +89,7 @@
#include <map>
using namespace llvm;
+using namespace llvm::PatternMatch;
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
@@ -112,9 +115,9 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
-static const unsigned RuntimeMemoryCheckThreshold = 4;
+/// When performing memory disambiguation checks at runtime do not make more
+/// than this number of comparisons.
+static const unsigned RuntimeMemoryCheckThreshold = 8;
/// We use a metadata with this name to indicate that a scalar loop was
/// vectorized and that we don't need to re-vectorize it if we run into it
@@ -214,7 +217,7 @@ private:
/// This function adds (StartIdx + 0, StartIdx + 1, ...) to each vector
/// element of Val. If Negate is set then negative numbers are added,
/// e.g. (StartIdx + 0, StartIdx - 1, StartIdx - 2, ...).
- Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+ Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
@@ -333,7 +336,7 @@ public:
DominatorTree *DT, TargetTransformInfo* TTI,
AliasAnalysis *AA, TargetLibraryInfo *TLI)
: TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
- Induction(0) {}
+ Induction(0), HasFunNoNaNAttr(false) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -343,8 +346,10 @@ public:
RK_IntegerOr, ///< Bitwise or logical OR of numbers.
RK_IntegerAnd, ///< Bitwise or logical AND of numbers.
RK_IntegerXor, ///< Bitwise or logical XOR of numbers.
+ RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
RK_FloatAdd, ///< Sum of floats.
- RK_FloatMult ///< Product of floats.
+ RK_FloatMult, ///< Product of floats.
+ RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()).
};
/// This enum represents the kinds of inductions that we support.
@@ -356,21 +361,52 @@ public:
IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem).
};
+ // This enum represents the kind of minmax reduction.
+ enum MinMaxReductionKind {
+ MRK_Invalid,
+ MRK_UIntMin,
+ MRK_UIntMax,
+ MRK_SIntMin,
+ MRK_SIntMax,
+ MRK_FloatMin,
+ MRK_FloatMax
+ };
+
/// This POD struct holds information about reduction variables.
struct ReductionDescriptor {
ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
- Kind(RK_NoReduction) {}
+ Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
- ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
- : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+ ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
+ MinMaxReductionKind MK)
+ : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
// The starting value of the reduction.
// It does not have to be zero!
- Value *StartValue;
+ TrackingVH<Value> StartValue;
// The instruction who's value is used outside the loop.
Instruction *LoopExitInstr;
// The kind of the reduction.
ReductionKind Kind;
+ // If this is a min/max reduction, the kind of reduction.
+ MinMaxReductionKind MinMaxKind;
+ };
+
+ /// This POD struct holds information about a potential reduction operation.
+ struct ReductionInstDesc {
+ ReductionInstDesc(bool IsRedux, Instruction *I) :
+ IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
+
+ ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
+ IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
+
+ // Is this instruction a reduction candidate.
+ bool IsReduction;
+ // The last instruction in a min/max pattern (the select of the
+ // select(icmp()) pattern), or the current reduction instruction otherwise.
+ Instruction *PatternLastInst;
+ // If this is a min/max pattern the comparison predicate.
+ MinMaxReductionKind MinMaxKind;
};
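For orientation, the select(cmp()) shape these descriptors classify comes from scalar source like the following hand-written sketch (hypothetical example, assuming n >= 1):

    // The conditional lowers to an icmp sgt feeding a select, i.e. the
    // MRK_SIntMax flavor of the select(cmp()) min/max pattern.
    int smax_reduce(const int *a, int n) {
      int m = a[0];
      for (int i = 1; i < n; ++i)
        m = a[i] > m ? a[i] : m; // icmp sgt + select
      return m;
    }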
// This POD struct holds information about the memory runtime legality
@@ -387,16 +423,18 @@ public:
}
/// Insert a pointer and calculate the start and end SCEVs.
- void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+ void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
/// This flag indicates if we need to add the runtime check.
bool Need;
/// Holds the pointers that we need to check.
- SmallVector<Value*, 2> Pointers;
+ SmallVector<TrackingVH<Value>, 2> Pointers;
/// Holds the pointer value at the beginning of the loop.
SmallVector<const SCEV*, 2> Starts;
/// Holds the pointer value at the end of the loop.
SmallVector<const SCEV*, 2> Ends;
+ /// Holds the information if this pointer is used for writing to memory.
+ SmallVector<bool, 2> IsWritePtr;
};
/// A POD for saving information about induction variables.
@@ -404,7 +442,7 @@ public:
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
/// Start value.
- Value *StartValue;
+ TrackingVH<Value> StartValue;
/// Induction kind.
InductionKind IK;
};
@@ -461,6 +499,10 @@ public:
/// Returns the information that we collected about runtime memory check.
RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+
+ /// This function returns the identity element (or neutral element) for
+ /// the operation K.
+ static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -487,9 +529,17 @@ private:
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
- /// Returns true if the instruction I can be a reduction variable of type
- /// 'Kind'.
- bool isReductionInstr(Instruction *I, ReductionKind Kind);
+ /// Returns a struct describing if the instruction 'I' can be a reduction
+ /// variable of type 'Kind'. If the reduction is a min/max pattern of
+ /// select(icmp()), this function advances the instruction pointer 'I' from
+ /// the compare instruction to the select instruction and stores this
+ /// pointer in the 'PatternLastInst' member of the returned struct.
+ ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
+ ReductionInstDesc &Desc);
+ /// Returns a struct describing whether the instruction is a
+ /// Select(ICmp(X, Y), X, Y) pattern corresponding to a min(X, Y) or
+ /// max(X, Y).
+ static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
+ ReductionInstDesc &Prev);
/// Returns the induction kind of Phi. This function may return NoInduction
/// if the PHI is not an induction variable.
InductionKind isInductionVariable(PHINode *Phi);
@@ -540,6 +590,8 @@ private:
/// We need to check that all of the pointers in this list are disjoint
/// at runtime.
RuntimePointerCheck PtrRtCheck;
+ /// Can we assume the absence of NaNs.
+ bool HasFunNoNaNAttr;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -662,6 +714,11 @@ struct LoopVectorize : public LoopPass {
AA = getAnalysisIfAvailable<AliasAnalysis>();
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ if (DL == NULL) {
+ DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+ return false;
+ }
+
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
@@ -737,7 +794,8 @@ struct LoopVectorize : public LoopPass {
void
LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
- Loop *Lp, Value *Ptr) {
+ Loop *Lp, Value *Ptr,
+ bool WritePtr) {
const SCEV *Sc = SE->getSCEV(Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
assert(AR && "Invalid addrec expression");
@@ -746,6 +804,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
Pointers.push_back(Ptr);
Starts.push_back(AR->getStart());
Ends.push_back(ScEnd);
+ IsWritePtr.push_back(WritePtr);
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -771,7 +830,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
bool Negate) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
@@ -784,8 +843,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i) {
- int Idx = Negate ? (-i): i;
- Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+ int64_t Idx = Negate ? (-i) : i;
+ Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
}
// Add the consecutive indices to the vector value.
@@ -906,12 +965,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
+ unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+
+ if (ScalarAllocatedSize != VectorElementSize)
+ return scalarizeInstruction(Instr);
+
// If the pointer is loop invariant or if it is non consecutive,
// scalarize the load.
- int Stride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = Stride < 0;
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
- if (Stride == 0 || UniformLoad)
+ if (!ConsecutiveStride || UniformLoad)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
@@ -1040,10 +1105,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
- // For each scalar that we create:
- for (unsigned Width = 0; Width < VF; ++Width) {
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each scalar that we create:
+ for (unsigned Width = 0; Width < VF; ++Width) {
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
@@ -1110,6 +1175,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i+1; j < NumPointers; ++j) {
+ // No need to check if two readonly pointers intersect.
+ if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
+ continue;
+
Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
@@ -1167,7 +1236,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Mark the old scalar loop with metadata that tells us not to vectorize this
// loop again if we run into it.
- MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
+ MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None);
OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
// Some loops have a single integer induction variable, while other loops
@@ -1436,24 +1505,24 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
/// This function returns the identity element (or neutral element) for
/// the operation K.
-static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+Constant*
+LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
switch (K) {
- case LoopVectorizationLegality:: RK_IntegerXor:
- case LoopVectorizationLegality:: RK_IntegerAdd:
- case LoopVectorizationLegality:: RK_IntegerOr:
+ case RK_IntegerXor:
+ case RK_IntegerAdd:
+ case RK_IntegerOr:
// Adding, Xoring, Oring zero to a number does not change it.
return ConstantInt::get(Tp, 0);
- case LoopVectorizationLegality:: RK_IntegerMult:
+ case RK_IntegerMult:
// Multiplying a number by 1 does not change it.
return ConstantInt::get(Tp, 1);
- case LoopVectorizationLegality:: RK_IntegerAnd:
+ case RK_IntegerAnd:
// AND-ing a number with an all-1 value does not change it.
return ConstantInt::get(Tp, -1, true);
- case LoopVectorizationLegality:: RK_FloatMult:
+ case RK_FloatMult:
// Multiplying a number by 1 does not change it.
return ConstantFP::get(Tp, 1.0L);
- case LoopVectorizationLegality:: RK_FloatAdd:
+ case RK_FloatAdd:
// Adding zero to a number does not change it.
return ConstantFP::get(Tp, 0.0L);
default:
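A quick sanity check of the identities chosen above, as a small hand-written C++ sketch (note the floating-point identities only hold in the absence of NaNs, which is why the legality rules require fast-math for FP reductions):

    #include <cassert>

    // x op identity == x for every supported reduction kind.
    void checkIdentities(int x, float f) {
      assert((x + 0) == x && (x ^ 0) == x && (x | 0) == x); // add, xor, or
      assert((x * 1) == x && (x & -1) == x);                // mul, and
      assert(f * 1.0f == f || f != f); // fmul; 'f != f' admits NaN
      assert(f + 0.0f == f || f != f); // fadd; -0.0f + 0.0f compares equal
    }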
@@ -1566,7 +1635,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
}
/// This function translates the reduction kind to an LLVM binary operator.
-static Instruction::BinaryOps
+static unsigned
getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
switch (Kind) {
case LoopVectorizationLegality::RK_IntegerAdd:
@@ -1583,11 +1652,53 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
return Instruction::FMul;
case LoopVectorizationLegality::RK_FloatAdd:
return Instruction::FAdd;
+ case LoopVectorizationLegality::RK_IntegerMinMax:
+ return Instruction::ICmp;
+ case LoopVectorizationLegality::RK_FloatMinMax:
+ return Instruction::FCmp;
default:
llvm_unreachable("Unknown reduction operation");
}
}
+Value *createMinMaxOp(IRBuilder<> &Builder,
+ LoopVectorizationLegality::MinMaxReductionKind RK,
+ Value *Left,
+ Value *Right) {
+ CmpInst::Predicate P = CmpInst::ICMP_NE;
+ switch (RK) {
+ default:
+ llvm_unreachable("Unknown min/max reduction kind");
+ case LoopVectorizationLegality::MRK_UIntMin:
+ P = CmpInst::ICMP_ULT;
+ break;
+ case LoopVectorizationLegality::MRK_UIntMax:
+ P = CmpInst::ICMP_UGT;
+ break;
+ case LoopVectorizationLegality::MRK_SIntMin:
+ P = CmpInst::ICMP_SLT;
+ break;
+ case LoopVectorizationLegality::MRK_SIntMax:
+ P = CmpInst::ICMP_SGT;
+ break;
+ case LoopVectorizationLegality::MRK_FloatMin:
+ P = CmpInst::FCMP_OLT;
+ break;
+ case LoopVectorizationLegality::MRK_FloatMax:
+ P = CmpInst::FCMP_OGT;
+ break;
+ }
+
+ Value *Cmp;
+ if (RK == LoopVectorizationLegality::MRK_FloatMin ||
+     RK == LoopVectorizationLegality::MRK_FloatMax)
+ Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
+ else
+ Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
+ Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+ return Select;
+}
+
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
@@ -1651,13 +1762,24 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
- Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
- Constant *Identity = ConstantVector::getSplat(VF, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- Value *VectorStart = Builder.CreateInsertElement(Identity,
- RdxDesc.StartValue, Zero);
+ Value *Identity;
+ Value *VectorStart;
+ if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
+ RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
+ // MinMax reductions have the start value as their identity.
+ VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
+ "minmax.ident");
+ } else {
+ Constant *Iden =
+ LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+ VecTy->getScalarType());
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = Builder.CreateInsertElement(Identity,
+ RdxDesc.StartValue, Zero);
+ }
// Fix the vector-loop phi.
// We created the induction variable so we know that the
@@ -1699,10 +1821,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = getReductionBinOp(RdxDesc.Kind);
for (unsigned part = 1; part < UF; ++part) {
- Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
- ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
- "bin.rdx");
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ RdxParts[part], ReducedPartRdx,
+ "bin.rdx");
+ else
+ ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
+ ReducedPartRdx, RdxParts[part]);
}
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
@@ -1727,8 +1854,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
ConstantVector::get(ShuffleMask),
"rdx.shuf");
- Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
- TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+ "bin.rdx");
+ else
+ TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
}
// The result is in the first element of the vector.
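A scalar C++ model of the log2(VF) halving performed by these shuffles (a sketch of the idea, not the emitted IR):

    #include <cstddef>

    // Repeatedly fold the upper half of the lanes into the lower half
    // until one lane remains; 'combine' stands for the reduction op
    // (add, mul, min, max, ...). Requires vf to be a power of two.
    template <typename T, typename Op>
    T reduceByHalving(T *lanes, size_t vf, Op combine) {
      for (size_t half = vf / 2; half >= 1; half /= 2)
        for (size_t i = 0; i < half; ++i)
          lanes[i] = combine(lanes[i], lanes[i + half]); // shuffle + op
      return lanes[0]; // the result ends up in lane 0
    }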
@@ -1861,18 +1991,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
// We know that all PHIs in non header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
-
// At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
- P->getParent());
- for (unsigned part = 0; part < UF; ++part) {
- VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
- VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
- "predphi");
+ unsigned NumIncoming = P->getNumIncomingValues();
+ assert(NumIncoming > 1 && "Invalid PHI");
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+ P->getParent());
+ VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+ for (unsigned part = 0; part < UF; ++part) {
+ // We don't need to 'select' the first PHI operand because it is
+ // the default value if all of the other masks don't match.
+ if (In == 0)
+ Entry[part] = In0[part];
+ else
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ Entry[part], "predphi");
+ }
}
continue;
}
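The select chain built above reduces to the following plain-C++ model for a PHI with three incoming edges (hypothetical masks and values):

    // SELECT(Mask2, In2, SELECT(Mask1, In1, In0)): the first incoming
    // value is the default; each later edge overrides it when its edge
    // mask is true.
    int blendPhi(bool Mask1, bool Mask2, int In0, int In1, int In2) {
      int R = In0;         // In == 0: default value, no select needed
      R = Mask1 ? In1 : R; // select between edge 1 and the previous value
      R = Mask2 ? In2 : R; // select between edge 2 and the previous value
      return R;
    }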
@@ -1928,7 +2073,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
// After broadcasting the induction variable we need to make the
// vector consecutive by adding ... -3, -2, -1, 0.
for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
+ Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
+ true);
continue;
}
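For a reversed induction with VF = 4 and two unrolled parts, the lanes produced here can be modeled in plain C++ as follows (a sketch; 'i' is the broadcast induction value):

    // Part 0 holds {i, i-1, i-2, i-3}, part 1 holds {i-4, ..., i-7}:
    // getConsecutiveVector adds the negated offsets 0, -1, -2, -3,
    // shifted by -VF * part.
    void reverseLanes(int i, int lanes[2][4]) {
      for (int part = 0; part < 2; ++part)
        for (int w = 0; w < 4; ++w)
          lanes[part][w] = i + (-4 * part) + (-w);
    }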
@@ -2152,12 +2298,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!isa<BranchInst>(BB->getTerminator()))
return false;
- // We must have at most two predecessors because we need to convert
- // all PHIs to selects.
- unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
- if (Preds > 2)
- return false;
-
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
return false;
@@ -2168,7 +2308,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
}
bool LoopVectorizationLegality::canVectorize() {
- assert(TheLoop->getLoopPreheader() && "No preheader!!");
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!TheLoop->getLoopPreheader())
+ return false;
// We can only vectorize innermost loops.
if (TheLoop->getSubLoopsVector().size())
@@ -2235,6 +2378,26 @@ bool LoopVectorizationLegality::canVectorize() {
return true;
}
+/// \brief Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSet<Value *, 4> &Reductions) {
+ // Reduction instructions are allowed to have exit users. All other
+ // instructions must not have external users.
+ if (!Reductions.count(Inst))
+ // Check that all of the users of the instruction are inside the loop.
+ for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+ I != E; ++I) {
+ Instruction *U = cast<Instruction>(*I);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(U)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ return true;
+ }
+ }
+ return false;
+}
+
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
@@ -2246,6 +2409,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ if (F.hasFnAttribute("no-nans-fp-math"))
+ HasFunNoNaNAttr = F.getAttributes().getAttribute(
+ AttributeSet::FunctionIndex,
+ "no-nans-fp-math").getValueAsString() == "true";
+
// For each block in the loop.
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2255,12 +2425,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
++it) {
if (PHINode *Phi = dyn_cast<PHINode>(it)) {
- // This should not happen because the loop should be normalized.
- if (Phi->getNumIncomingValues() != 2) {
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
-
// Check that this PHI type is allowed.
if (!Phi->getType()->isIntegerTy() &&
!Phi->getType()->isFloatingPointTy() &&
@@ -2272,8 +2436,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// If this PHINode is not in the header block, then we know that we
// can convert it to select during if-conversion. No need to check if
// the PHIs in this block are induction or reduction variables.
- if (*bb != Header)
- continue;
+ if (*bb != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ continue;
+ return false;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
// This is the value coming from the preheader.
Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
@@ -2315,6 +2490,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
continue;
}
+ if (AddReductionVar(Phi, RK_IntegerMinMax)) {
+ DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
if (AddReductionVar(Phi, RK_FloatMult)) {
DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
continue;
@@ -2323,6 +2502,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
continue;
}
+ if (AddReductionVar(Phi, RK_FloatMinMax)) {
+ DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
@@ -2352,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (!AllowedExit.count(it))
- //Check that all of the users of the loop are inside the BB.
- for (Value::use_iterator I = it->use_begin(), E = it->use_end();
- I != E; ++I) {
- Instruction *U = cast<Instruction>(*I);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(U)) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
- return false;
- }
- }
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ return false;
+
} // next instr.
}
@@ -2442,13 +2617,6 @@ LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
bool LoopVectorizationLegality::canVectorizeMemory() {
- if (TheLoop->isAnnotatedParallel()) {
- DEBUG(dbgs()
- << "LV: A loop annotated parallel, ignore memory dependency "
- << "checks.\n");
- return true;
- }
-
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
// Holds the Load and Store *instructions*.
@@ -2457,6 +2625,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
PtrRtCheck.Pointers.clear();
PtrRtCheck.Need = false;
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
// For each block.
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2471,7 +2641,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
if (it->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(it);
if (!Ld) return false;
- if (!Ld->isSimple()) {
+ if (!Ld->isSimple() && !IsAnnotatedParallel) {
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
@@ -2483,7 +2653,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
if (it->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(it);
if (!St) return false;
- if (!St->isSimple()) {
+ if (!St->isSimple() && !IsAnnotatedParallel) {
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
@@ -2530,6 +2700,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
ReadWrites.insert(std::make_pair(Ptr, ST));
}
+ if (IsAnnotatedParallel) {
+ DEBUG(dbgs()
+ << "LV: A loop annotated parallel, ignore memory dependency "
+ << "checks.\n");
+ return true;
+ }
+
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
LoadInst *LD = cast<LoadInst>(*I);
Value* Ptr = LD->getPointerOperand();
@@ -2552,6 +2729,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return true;
}
+ unsigned NumReadPtrs = 0;
+ unsigned NumWritePtrs = 0;
+
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
bool CanDoRT = true;
@@ -2559,7 +2739,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
Value *V = (*MI).first;
if (hasComputableBounds(V)) {
- PtrRtCheck.insert(SE, TheLoop, V);
+ PtrRtCheck.insert(SE, TheLoop, V, true);
+ NumWritePtrs++;
DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
@@ -2569,7 +2750,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
Value *V = (*MI).first;
if (hasComputableBounds(V)) {
- PtrRtCheck.insert(SE, TheLoop, V);
+ PtrRtCheck.insert(SE, TheLoop, V, false);
+ NumReadPtrs++;
DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
@@ -2579,7 +2761,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// Check that we did not collect too many pointers or found a
// unsizeable pointer.
- if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
+ DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
+ if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
PtrRtCheck.reset();
CanDoRT = false;
}
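To see what the new threshold admits, a small sketch of the comparison count computed above: each of the W write pointers is counted against all R reads and the other W - 1 writes, while read-read pairs are skipped.

    // NumComparisons = W * (R + W - 1). With the default threshold of 8
    // this admits, e.g., two writes and three reads: 2 * (3 + 2 - 1) == 8.
    unsigned runtimeCheckComparisons(unsigned NumReadPtrs,
                                     unsigned NumWritePtrs) {
      return NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1);
    }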
@@ -2642,8 +2826,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
Inst,
WriteObjects,
MaxByteWidth)) {
- DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
- << *UI <<"\n");
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
+ << "\n");
return false;
}
@@ -2686,8 +2870,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
Inst,
WriteObjects,
MaxByteWidth)) {
- DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
- << *UI <<"\n");
+ DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
+ << "\n");
return false;
}
}
@@ -2733,7 +2917,18 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
// used as reduction variables (such as ADD). We may have a single
// out-of-block user. The cycle must end with the original PHI.
Instruction *Iter = Phi;
- while (true) {
+
+ // To recognize min/max patterns formed by an icmp/select sequence, we store
+ // the number of instructions we saw from the recognized min/max pattern, so
+ // that we don't stop when we see that the phi has two uses (one by the
+ // select and one by the icmp), and so that we can make sure we see exactly
+ // those two instructions.
+ unsigned NumCmpSelectPatternInst = 0;
+ ReductionInstDesc ReduxDesc(false, 0);
+
+ // Avoid cycles in the chain.
+ SmallPtrSet<Instruction *, 8> VisitedInsts;
+ while (VisitedInsts.insert(Iter)) {
// If the instruction has no users then this is a broken
// chain and can't be a reduction variable.
if (Iter->use_empty())
@@ -2747,9 +2942,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
// Is this a bin op ?
FoundBinOp |= !isa<PHINode>(Iter);
- // Remember the current instruction.
- Instruction *OldIter = Iter;
-
// For each of the *users* of iter.
for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
it != e; ++it) {
@@ -2778,25 +2970,35 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
Iter->hasNUsesOrMore(2))
continue;
- // We can't have multiple inside users.
- if (FoundInBlockUser)
+ // We can't have multiple inside users except for a combination of
+ // icmp/select both using the phi.
+ if (FoundInBlockUser && !NumCmpSelectPatternInst)
return false;
FoundInBlockUser = true;
// Any reduction instr must be of one of the allowed kinds.
- if (!isReductionInstr(U, Kind))
+ ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
+ if (!ReduxDesc.IsReduction)
return false;
+ if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U)))
+ ++NumCmpSelectPatternInst;
+ if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U)))
+ ++NumCmpSelectPatternInst;
+
// Reductions of instructions such as Div and Sub are only
// possible if the LHS is the reduction variable.
- if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
+ if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
+ !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter)
return false;
- Iter = U;
+ Iter = ReduxDesc.PatternLastInst;
}
- // If all uses were skipped this can't be a reduction variable.
- if (Iter == OldIter)
+ // This means we have seen either only one of the two instructions of the
+ // pattern, or more than just a select and cmp.
+ if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+ NumCmpSelectPatternInst != 2)
return false;
// We found a reduction var if we have reached the original
@@ -2807,47 +3009,107 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
AllowedExit.insert(ExitInstruction);
// Save the description of this reduction variable.
- ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
+ ReduxDesc.MinMaxKind);
Reductions[Phi] = RD;
// We've ended the cycle. This is a reduction variable if we have an
// outside user and it has a binary op.
return FoundBinOp && ExitInstruction;
}
}
+
+ return false;
}
-bool
+/// Returns a struct describing whether the instruction is a
+/// Select(ICmp(X, Y), X, Y) pattern corresponding to a min(X, Y) or
+/// max(X, Y).
+LoopVectorizationLegality::ReductionInstDesc
+LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
+ ReductionInstDesc &Prev) {
+
+ assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
+ "Expect a select instruction");
+ Instruction *Cmp = 0;
+ SelectInst *Select = 0;
+
+ // We must handle the select(cmp()) as a single instruction. Advance to the
+ // select.
+ if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
+ if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
+ return ReductionInstDesc(false, I);
+ return ReductionInstDesc(Select, Prev.MinMaxKind);
+ }
+
+ // Only handle single use cases for now.
+ if (!(Select = dyn_cast<SelectInst>(I)))
+ return ReductionInstDesc(false, I);
+ if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
+ !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
+ return ReductionInstDesc(false, I);
+ if (!Cmp->hasOneUse())
+ return ReductionInstDesc(false, I);
+
+ Value *CmpLeft;
+ Value *CmpRight;
+
+ // Look for a min/max pattern.
+ if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_UIntMin);
+ else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_UIntMax);
+ else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_SIntMax);
+ else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_SIntMin);
+ else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMin);
+ else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMax);
+ else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMin);
+ else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMax);
+
+ return ReductionInstDesc(false, I);
+}
+
+LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
- ReductionKind Kind) {
+ ReductionKind Kind,
+ ReductionInstDesc &Prev) {
bool FP = I->getType()->isFloatingPointTy();
bool FastMath = (FP && I->isCommutative() && I->isAssociative());
-
switch (I->getOpcode()) {
default:
- return false;
+ return ReductionInstDesc(false, I);
case Instruction::PHI:
- if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
- return false;
- // possibly.
- return true;
+ if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
+ Kind != RK_FloatMinMax))
+ return ReductionInstDesc(false, I);
+ return ReductionInstDesc(I, Prev.MinMaxKind);
case Instruction::Sub:
case Instruction::Add:
- return Kind == RK_IntegerAdd;
- case Instruction::SDiv:
- case Instruction::UDiv:
+ return ReductionInstDesc(Kind == RK_IntegerAdd, I);
case Instruction::Mul:
- return Kind == RK_IntegerMult;
+ return ReductionInstDesc(Kind == RK_IntegerMult, I);
case Instruction::And:
- return Kind == RK_IntegerAnd;
+ return ReductionInstDesc(Kind == RK_IntegerAnd, I);
case Instruction::Or:
- return Kind == RK_IntegerOr;
+ return ReductionInstDesc(Kind == RK_IntegerOr, I);
case Instruction::Xor:
- return Kind == RK_IntegerXor;
+ return ReductionInstDesc(Kind == RK_IntegerXor, I);
case Instruction::FMul:
- return Kind == RK_FloatMult && FastMath;
+ return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
case Instruction::FAdd:
- return Kind == RK_FloatAdd && FastMath;
- }
+ return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select:
+ if (Kind != RK_IntegerMinMax &&
+ (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
+ return ReductionInstDesc(false, I);
+ return isMinMaxSelectCmpPattern(I, Prev);
+ }
}
LoopVectorizationLegality::InductionKind
@@ -3380,9 +3642,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
// Scalarized loads/stores.
- int Stride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = Stride < 0;
- if (0 == Stride) {
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
+ unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
+ unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+ if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
unsigned Cost = 0;
// The cost of extracting from the value vector and pointer vector.
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 0000000..cc30cc9
--- /dev/null
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,348 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct a vectorizable tree using the use-def chains. If a profitable
+// tree is found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
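A hand-written example of the kind of code this pass targets (illustrative only):

    // Four consecutive stores applying the same operation to adjacent
    // elements; the bottom-up SLP vectorizer can merge them into a single
    // 4-wide vector add feeding a 4-wide vector store.
    void add4(int *a, const int *b, const int *c) {
      a[0] = b[0] + c[0];
      a[1] = b[1] + c[1];
      a[2] = b[2] + c[2];
      a[3] = b[3] + c[3];
    }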
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE SV_NAME
+
+#include "VecUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<int>
+SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Only vectorize trees if the gain is above this "
+ "number. (gain = -cost of vectorization)"));
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ ScalarEvolution *SE;
+ DataLayout *DL;
+ TargetTransformInfo *TTI;
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+
+ virtual bool runOnFunction(Function &F) {
+ SE = &getAnalysis<ScalarEvolution>();
+ DL = getAnalysisIfAvailable<DataLayout>();
+ TTI = &getAnalysis<TargetTransformInfo>();
+ AA = &getAnalysis<AliasAnalysis>();
+ LI = &getAnalysis<LoopInfo>();
+
+ StoreRefs.clear();
+ bool Changed = false;
+
+ // Must have DataLayout. We can't require it because some tests run w/o
+ // triple.
+ if (!DL)
+ return false;
+
+ for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
+ BasicBlock *BB = it;
+ bool BBChanged = false;
+
+ // Use the bottom-up SLP vectorizer to construct chains that start with
+ // the store instructions.
+ BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
+
+ // Vectorize trees that end at reductions.
+ BBChanged |= vectorizeReductions(BB, R);
+
+ // Vectorize trees that end at stores.
+ if (unsigned count = collectStores(BB, R)) {
+ (void)count;
+ DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n");
+ BBChanged |= vectorizeStoreChains(R);
+ }
+
+ // Try to hoist some of the scalarization code to the preheader.
+ if (BBChanged) hoistGatherSequence(LI, BB, R);
+
+ Changed |= BBChanged;
+ }
+
+ if (Changed) {
+ DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n");
+ DEBUG(verifyFunction(F));
+ }
+ return Changed;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<LoopInfo>();
+ }
+
+private:
+
+ /// \brief Collect memory references and sort them according to their base
+ /// object. We sort the stores to their base objects to reduce the cost of the
+ /// quadratic search on the stores. TODO: We can further reduce this cost
+ /// if we flush the chain creation every time we run into a memory barrier.
+ unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+
+ /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
+ bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+
+ /// \brief Try to vectorize a list of operands.
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+
+ /// \brief Try to vectorize a chain that may start at the operands of \p V.
+ bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+
+ /// \brief Vectorize the stores that were collected in StoreRefs.
+ bool vectorizeStoreChains(BoUpSLP &R);
+
+ /// \brief Try to hoist gather sequences outside of the loop in cases where
+ /// all of the sources are loop invariant.
+ void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
+
+ /// \brief Scan the basic block and look for reductions that may start a
+ /// vectorization chain.
+ bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R);
+
+private:
+ StoreListMap StoreRefs;
+};
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+ unsigned count = 0;
+ StoreRefs.clear();
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ StoreInst *SI = dyn_cast<StoreInst>(it);
+ if (!SI)
+ continue;
+
+ // Check that the pointer points to scalars.
+ Type *Ty = SI->getValueOperand()->getType();
+ if (Ty->isAggregateType() || Ty->isVectorTy())
+ return 0;
+
+ // Find the base of the GEP.
+ Value *Ptr = SI->getPointerOperand();
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+ Ptr = GEP->getPointerOperand();
+
+ // Save the store locations.
+ StoreRefs[Ptr].push_back(SI);
+ count++;
+ }
+ return count;
+}
+
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B) return false;
+ Value *VL[] = { A, B };
+ return tryToVectorizeList(VL, R);
+}
+
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+ DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+
+ // Check that all of the parts are scalar.
+ for (int i = 0, e = VL.size(); i < e; ++i) {
+ Type *Ty = VL[i]->getType();
+ if (Ty->isAggregateType() || Ty->isVectorTy())
+ return false;
+ }
+
+ int Cost = R.getTreeCost(VL);
+ int ExtrCost = R.getScalarizationCost(VL);
+ DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
+ " Cost of extract:" << ExtrCost << ".\n");
+ if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
+ DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
+ R.vectorizeArith(VL);
+ return true;
+}
+
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+ if (!V) return false;
+ // Try to vectorize V.
+ if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ return true;
+
+ BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+ BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (tryToVectorizePair(A, B0, R)) {
+ B->moveBefore(V);
+ return true;
+ }
+ if (tryToVectorizePair(A, B1, R)) {
+ B->moveBefore(V);
+ return true;
+ }
+ }
+
+ // Try to skip A.
+ if (A && A->hasOneUse()) {
+ BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (tryToVectorizePair(A0, B, R)) {
+ A->moveBefore(V);
+ return true;
+ }
+ if (tryToVectorizePair(A1, B, R)) {
+ A->moveBefore(V);
+ return true;
+ }
+ }
+ return false;
+}
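A tiny illustration (hypothetical source) of the one-level-deeper search above: if the top-level pair fails, the operands of a single-use binary subtree are tried instead, so the two subtractions below can still be paired even though the outer add's right operand is another add.

    double horizontal(double a, double b, double c, double d,
                      double e, double f) {
      return (a - b) + ((c - d) + (e - f));
    }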
+
+bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ if (isa<DbgInfoIntrinsic>(it)) continue;
+
+ // Try to vectorize reductions that use PHINodes.
+ if (PHINode *P = dyn_cast<PHINode>(it)) {
+ // Check that the PHI is a reduction PHI.
+ if (P->getNumIncomingValues() != 2) return Changed;
+ Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+ (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
+ 0));
+ // Check if this is a Binary Operator.
+ BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+ if (!BI)
+ continue;
+
+ Value *Inst = BI->getOperand(0);
+ if (Inst == P) Inst = BI->getOperand(1);
+ Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+ continue;
+ }
+
+ // Try to vectorize trees that start at compare instructions.
+ if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+ Changed |= true;
+ continue;
+ }
+ for (int i = 0; i < 2; ++i)
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
+ Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+ bool Changed = false;
+ // Attempt to sort and vectorize each of the store-groups.
+ for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+ it != e; ++it) {
+ if (it->second.size() < 2)
+ continue;
+
+ DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
+ it->second.size() << ".\n");
+
+ Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+ }
+ return Changed;
+}
+
+void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
+ BoUpSLP &R) {
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(BB);
+ if (!L)
+ return;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ return;
+
+ // Mark the insertion point for the block.
+ Instruction *Location = PreHeader->getTerminator();
+
+ BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions();
+ for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end();
+ it != e; ++it) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+
+ // The InsertElement sequence can be simplified into a constant.
+ if (!Insert)
+ continue;
+
+ // If the vector or the element that we insert into it is an
+ // instruction that is defined inside the loop, then we can't
+ // hoist this instruction.
+ Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+ Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+ if (CurrVec && L->contains(CurrVec)) continue;
+ if (NewElem && L->contains(NewElem)) continue;
+
+ // We can hoist this instruction. Move it to the pre-header.
+ Insert->moveBefore(Location);
+ }
+}
+
+} // end anonymous namespace
+
+char SLPVectorizer::ID = 0;
+static const char lv_name[] = "SLP Vectorizer";
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+namespace llvm {
+ Pass *createSLPVectorizerPass() {
+ return new SLPVectorizer();
+ }
+}
+
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
new file mode 100644
index 0000000..9b94366
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -0,0 +1,730 @@
+//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "SLP"
+
+#include "VecUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 6;
+
+namespace llvm {
+
+BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
+ TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) :
+ BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) {
+ numberInstructions();
+}
+
+void BoUpSLP::numberInstructions() {
+ int Loc = 0;
+ InstrIdx.clear();
+ InstrVec.clear();
+ // Number the instructions in the block.
+ for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
+ InstrIdx[it] = Loc++;
+ InstrVec.push_back(it);
+ assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+ }
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+ return 0;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+ if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+ if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+ return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+ Value *PtrA = getPointerOperand(A);
+ Value *PtrB = getPointerOperand(B);
+ unsigned ASA = getAddressSpaceOperand(A);
+ unsigned ASB = getAddressSpaceOperand(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB)) return false;
+
+ // Check that A and B are of the same type.
+ if (PtrA->getType() != PtrB->getType()) return false;
+
+ // Calculate the distance.
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+ const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+ const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+ // Non-constant distance.
+ if (!ConstOffSCEV) return false;
+
+ int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
+ Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+ // The instructions are consecutive if the size of the first load/store is
+ // the same as the offset.
+ int64_t Sz = DL->getTypeStoreSize(Ty);
+ return ((-Offset) == Sz);
+}
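A minimal model of this test (a sketch): with Offset = PtrA - PtrB computed via SCEV, the accesses are consecutive exactly when PtrB sits one element past PtrA.

    #include <cstdint>

    // E.g. 32-bit stores to a[i] and a[i+1]: Offset == -4, Sz == 4.
    bool isConsecutiveModel(int64_t Offset, int64_t Sz) {
      return -Offset == Sz;
    }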
+
+bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
+ Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+ unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+ unsigned VF = MinVecRegSize / Sz;
+
+ if (!isPowerOf2_32(Sz) || VF < 2) return false;
+
+ bool Changed = false;
+ // Look for profitable vectorizable trees at all offsets, starting at zero.
+ for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
+ if (i + VF > e) return Changed;
+ DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+ int Cost = getTreeCost(Operands);
+ DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ if (Cost < CostThreshold) {
+ DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ vectorizeTree(Operands, VF);
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
+ ValueSet Heads, Tails;
+ SmallDenseMap<Value*, Value*> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ ValueSet VectorizedStores;
+ bool Changed = false;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+ for (unsigned j = 0; j < e; ++j) {
+ if (i == j) continue;
+ if (isConsecutiveAccess(Stores[i], Stores[j])) {
+ Tails.insert(Stores[j]);
+ Heads.insert(Stores[i]);
+ ConsecutiveChain[Stores[i]] = Stores[j];
+ }
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
+ if (Tails.count(*it)) continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ ValueList Operands;
+ Value *I = *it;
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (VectorizedStores.count(I)) break;
+ Operands.push_back(I);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
+
+ // Mark the vectorized stores so that we don't vectorize them again.
+ if (Vectorized)
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
+int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
+ // Find the type of the operands in VL.
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ // Find the cost of inserting/extracting values from the vector.
+ return getScalarizationCost(VecTy);
+}
+
+int BoUpSLP::getScalarizationCost(Type *Ty) {
+ int Cost = 0;
+ for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ return Cost;
+}
+
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
+ return AliasAnalysis::Location();
+}
+
+Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
+ assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+ BasicBlock::iterator I = Src, E = Dst;
+  // Scan all of the instructions between Src and Dst and check if any of
+  // them may alias with the source.
+ for (++I; I != E; ++I) {
+    // Skip store instructions that are on the memory-barrier ignore list.
+ if (MemBarrierIgnoreList.count(I)) continue;
+ if (Src->mayWriteToMemory()) /* Write */ {
+ if (!I->mayReadOrWriteMemory()) continue;
+ } else /* Read */ {
+ if (!I->mayWriteToMemory()) continue;
+ }
+ AliasAnalysis::Location A = getLocation(&*I);
+ AliasAnalysis::Location B = getLocation(Src);
+
+ if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+ return I;
+ }
+ return 0;
+}
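+
+// Illustrative barrier (hypothetical): sinking the store to A[0] down to the
+// store to A[1] is unsafe if a load of *B sits between them and AA cannot
+// prove that A and B are distinct:
+//
+//   A[0] = X;  // Src (a write)
+//   T = *B;    // may alias A[0]; returned as the barrier
+//   A[1] = Y;  // Dst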
+
+void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
+ Value *Vec = vectorizeTree(Operands, Operands.size());
+ BasicBlock::iterator Loc = cast<Instruction>(Vec);
+ IRBuilder<> Builder(++Loc);
+ // After vectorizing the operands we need to generate extractelement
+ // instructions and replace all of the uses of the scalar values with
+ // the values that we extracted from the vectorized tree.
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
+ Operands[i]->replaceAllUsesWith(S);
+ }
+}
+
+int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
+  // Clear the state left over from previous runs: the memory-barrier ignore
+  // list, the lane map, and the multi-user and must-scalarize sets.
+ MemBarrierIgnoreList.clear();
+ LaneMap.clear();
+ MultiUserVals.clear();
+ MustScalarize.clear();
+
+ // Scan the tree and find which value is used by which lane, and which values
+ // must be scalarized.
+ getTreeUses_rec(VL, 0);
+
+ // Check that instructions with multiple users can be vectorized. Mark unsafe
+ // instructions.
+ for (ValueSet::iterator it = MultiUserVals.begin(),
+ e = MultiUserVals.end(); it != e; ++it) {
+ // Check that all of the users of this instr are within the tree
+ // and that they are all from the same lane.
+ int Lane = -1;
+ for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
+ I != E; ++I) {
+ if (LaneMap.find(*I) == LaneMap.end()) {
+ MustScalarize.insert(*it);
+ DEBUG(dbgs()<<"SLP: Adding " << **it <<
+ " to MustScalarize because of an out of tree usage.\n");
+ break;
+ }
+ if (Lane == -1) Lane = LaneMap[*I];
+ if (Lane != LaneMap[*I]) {
+ MustScalarize.insert(*it);
+ DEBUG(dbgs()<<"Adding " << **it <<
+ " to MustScalarize because multiple lane use it: "
+ << Lane << " and " << LaneMap[*I] << ".\n");
+ break;
+ }
+ }
+ }
+
+ // Now calculate the cost of vectorizing the tree.
+ return getTreeCost_rec(VL, 0);
+}
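+
+// Worked example (assuming unit costs from TTI; real costs are
+// target-dependent): for four consecutive i32 stores fed by four i32 adds,
+// the scalar cost is roughly 4 adds + 4 stores = 8 and the vector cost is
+// 1 add + 1 store = 2, so getTreeCost returns about 2 - 8 = -6. A negative
+// cost means that vectorization is profitable.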
+
+void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
+ if (Depth == RecursionMaxDepth) return;
+
+ // Don't handle vectors.
+ if (VL[0]->getType()->isVectorTy()) return;
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (SI->getValueOperand()->getType()->isVectorTy()) return;
+
+  // Check if all of the operands are constants or identical scalars.
+ bool AllConst = true;
+ bool AllSameScalar = true;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ AllConst &= isa<Constant>(VL[i]);
+ AllSameScalar &= (VL[0] == VL[i]);
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If one of the instructions is outside this BB, we need to scalarize all.
+ if (I && I->getParent() != BB) return;
+ }
+
+ // If all of the operands are identical or constant we have a simple solution.
+ if (AllConst || AllSameScalar) return;
+
+ // Scalarize unknown structures.
+ Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+ if (!VL0) return;
+
+ unsigned Opcode = VL0->getOpcode();
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ // If not all of the instructions are identical then we have to scalarize.
+ if (!I || Opcode != I->getOpcode()) return;
+ }
+
+ // Mark instructions with multiple users.
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ // Remember to check if all of the users of this instr are vectorized
+ // within our tree.
+ if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
+ }
+
+ for (int i = 0, e = VL.size(); i < e; ++i) {
+ // Check that the instruction is only used within
+ // one lane.
+ if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
+    // Mark this instruction as 'seen' and remember the lane.
+ LaneMap[VL[i]] = i;
+ }
+
+ switch (Opcode) {
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ getTreeUses_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::Store: {
+ ValueList Operands;
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+ getTreeUses_rec(Operands, Depth+1);
+ return;
+ }
+ default:
+ return;
+ }
+}
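+
+// Lane bookkeeping example (hypothetical): for the two-store tree
+//   A[0] = X0 + Y0;  // lane 0
+//   A[1] = X1 + Y1;  // lane 1
+// LaneMap records {X0, Y0 -> 0} and {X1, Y1 -> 1}. A value such as X0 that
+// also fed lane 1, or that had a user outside the tree, would be placed in
+// MustScalarize by getTreeCost.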
+
+int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
+ Type *ScalarTy = VL[0]->getType();
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+
+  // Don't vectorize values that are already vectors.
+ if (ScalarTy->isVectorTy()) return max_cost;
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
+
+  // Check if all of the operands are constants or identical scalars.
+ bool AllConst = true;
+ bool AllSameScalar = true;
+ bool MustScalarizeFlag = false;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ AllConst &= isa<Constant>(VL[i]);
+ AllSameScalar &= (VL[0] == VL[i]);
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Values used by multiple lanes or by out-of-tree users must be
+    // scalarized.
+    MustScalarizeFlag |= MustScalarize.count(VL[i]);
+    // Instructions outside this basic block force scalarization.
+ if (I && I->getParent() != BB)
+ return getScalarizationCost(VecTy);
+ }
+
+  // A vector of constants can be materialized for free.
+ if (AllConst) return 0;
+
+ // If all of the operands are identical we can broadcast them.
+ Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+ if (AllSameScalar) {
+ // If we are in a loop, and this is not an instruction (e.g. constant or
+ // argument) or the instruction is defined outside the loop then assume
+ // that the cost is zero.
+ if (L && (!VL0 || !L->contains(VL0)))
+ return 0;
+
+ // We need to broadcast the scalar.
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+
+  // Values that must be scalarized (because of multi-lane or out-of-tree
+  // users) take the scalarization cost.
+ if (MustScalarizeFlag)
+ return getScalarizationCost(VecTy);
+
+ if (!VL0) return getScalarizationCost(VecTy);
+ assert(VL0->getParent() == BB && "Wrong BB");
+
+ unsigned Opcode = VL0->getOpcode();
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ // If not all of the instructions are identical then we have to scalarize.
+ if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
+ }
+
+ // Check if it is safe to sink the loads or the stores.
+ if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+ int MaxIdx = InstrIdx[VL0];
+ for (unsigned i = 1, e = VL.size(); i < e; ++i )
+ MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+
+ Instruction *Last = InstrVec[MaxIdx];
+ for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
+ if (VL[i] == Last) continue;
+ Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
+ if (Barrier) {
+ DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
+ *Last << "\n because of " << *Barrier << "\n");
+ return max_cost;
+ }
+ }
+ }
+
+ switch (Opcode) {
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ int Cost = 0;
+ ValueList Operands;
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+ // Check that the casted type is the same for all users.
+ if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
+ return getScalarizationCost(VecTy);
+ }
+
+ Cost += getTreeCost_rec(Operands, Depth+1);
+ if (Cost >= max_cost) return max_cost;
+
+ // Calculate the cost of this instruction.
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+ VL0->getType(), SrcTy);
+
+ VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ Cost += (VecCost - ScalarCost);
+ return Cost;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ int Cost = 0;
+ // Calculate the cost of all of the operands.
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ Cost += getTreeCost_rec(Operands, Depth+1);
+ if (Cost >= max_cost) return max_cost;
+ }
+
+ // Calculate the cost of this instruction.
+ int ScalarCost = VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+
+ int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+ Cost += (VecCost - ScalarCost);
+ return Cost;
+ }
+ case Instruction::Load: {
+    // If the loads are not consecutive then we have to scalarize them and
+    // pay the cost of gathering the loaded values into a vector.
+ for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i], VL[i+1]))
+ return getScalarizationCost(VecTy);
+
+ // Cost of wide load - cost of scalar loads.
+ int ScalarLdCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+ return VecLdCost - ScalarLdCost;
+ }
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+ int ScalarStCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+ int StoreCost = VecStCost - ScalarStCost;
+
+ ValueList Operands;
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+ MemBarrierIgnoreList.insert(VL[j]);
+ }
+
+ int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
+ return TotalCost;
+ }
+ default:
+ // Unable to vectorize unknown instructions.
+ return getScalarizationCost(VecTy);
+ }
+}
+
+Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
+ int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
+ for (unsigned i = 0; i < VF; ++i )
+ MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+ return InstrVec[MaxIdx + 1];
+}
+
+Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
+ IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
+ Value *Vec = UndefValue::get(Ty);
+  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+ // Generate the 'InsertElement' instruction.
+ Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+ // Remember that this instruction is used as part of a 'gather' sequence.
+ // The caller of the bottom-up slp vectorizer can try to hoist the sequence
+ // if the users are outside of the basic block.
+ GatherInstructions.push_back(Vec);
+ }
+
+ return Vec;
+}
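+
+// The gather sequence built above looks like this (illustrative IR for a
+// group of four scalars):
+//
+//   %g0 = insertelement <4 x i32> undef, i32 %a, i32 0
+//   %g1 = insertelement <4 x i32> %g0, i32 %b, i32 1
+//   %g2 = insertelement <4 x i32> %g1, i32 %c, i32 2
+//   %g3 = insertelement <4 x i32> %g2, i32 %d, i32 3
+//
+// Every insertelement is recorded in GatherInstructions so that the caller
+// can later try to hoist the whole sequence out of the basic block.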
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
+ Value *V = vectorizeTree_rec(VL, VF);
+ // We moved some instructions around. We have to number them again
+ // before we can do any analysis.
+ numberInstructions();
+ MustScalarize.clear();
+ return V;
+}
+
+Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VF);
+
+ // Check if all of the operands are constants or identical.
+ bool AllConst = true;
+ bool AllSameScalar = true;
+ for (unsigned i = 0, e = VF; i < e; ++i) {
+ AllConst &= isa<Constant>(VL[i]);
+ AllSameScalar &= (VL[0] == VL[i]);
+ // The instruction must be in the same BB, and it must be vectorizable.
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
+ return Scalarize(VL, VecTy);
+ }
+
+  // Gather constants and repeated scalars into a vector.
+  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
+
+ // Scalarize unknown structures.
+ Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+ if (!VL0) return Scalarize(VL, VecTy);
+
+ if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
+
+ unsigned Opcode = VL0->getOpcode();
+ for (unsigned i = 0, e = VF; i < e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ // If not all of the instructions are identical then we have to scalarize.
+ if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
+ }
+
+ switch (Opcode) {
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ ValueList INVL;
+ for (int i = 0; i < VF; ++i)
+ INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+ Value *InVec = vectorizeTree_rec(INVL, VF);
+ IRBuilder<> Builder(GetLastInstr(VL, VF));
+    CastInst *CI = cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ VectorizedValues[VL0] = V;
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0; i < VF; ++i) {
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Value *LHS = vectorizeTree_rec(LHSVL, VF);
+    Value *RHS = vectorizeTree_rec(RHSVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ VectorizedValues[VL0] = V;
+ return V;
+ }
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(VL0);
+ unsigned Alignment = LI->getAlignment();
+
+ // Check if all of the loads are consecutive.
+ for (unsigned i = 1, e = VF; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i-1], VL[i]))
+ return Scalarize(VL, VecTy);
+
+ IRBuilder<> Builder(GetLastInstr(VL, VF));
+ Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+ VecTy->getPointerTo());
+ LI = Builder.CreateLoad(VecPtr);
+ LI->setAlignment(Alignment);
+ VectorizedValues[VL0] = LI;
+ return LI;
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(VL0);
+ unsigned Alignment = SI->getAlignment();
+
+ ValueList ValueOp;
+ for (int i = 0; i < VF; ++i)
+ ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+ Value *VecValue = vectorizeTree_rec(ValueOp, VF);
+
+ IRBuilder<> Builder(GetLastInstr(VL, VF));
+ Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+ VecTy->getPointerTo());
+ Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+ for (int i = 0; i < VF; ++i)
+ cast<Instruction>(VL[i])->eraseFromParent();
+ return 0;
+ }
+  default: {
+    Value *S = Scalarize(VL, VecTy);
+    VectorizedValues[VL0] = S;
+    return S;
+  }
+ }
+}
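+
+// End-to-end illustration (hypothetical): for the four scalar stores
+//   A[i] = B[i] + C[i], i = 0..3
+// the recursion above emits roughly the following IR:
+//   %b = load <4 x i32>* %pB    ; bitcast of B
+//   %c = load <4 x i32>* %pC    ; bitcast of C
+//   %s = add <4 x i32> %b, %c
+//   store <4 x i32> %s, <4 x i32>* %pA
+// and then erases the four original scalar stores.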
+
+} // end of namespace
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
new file mode 100644
index 0000000..5456c6c
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -0,0 +1,164 @@
+//===- VecUtils.h - Vectorization Utilities -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of classes and functions manipulates vectors and chains of
+// vectors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include <vector>
+
+namespace llvm {
+
+class BasicBlock; class Instruction; class Type;
+class VectorType; class StoreInst; class Value;
+class ScalarEvolution; class DataLayout;
+class TargetTransformInfo; class AliasAnalysis;
+class Loop;
+
+/// Bottom Up SLP vectorization utility class.
+struct BoUpSLP {
+ typedef SmallVector<Value*, 8> ValueList;
+ typedef SmallPtrSet<Value*, 16> ValueSet;
+ typedef SmallVector<StoreInst*, 8> StoreList;
+ static const int max_cost = 1<<20;
+
+  /// \brief Constructor.
+ BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
+ TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
+
+ /// \brief Take the pointer operand from the Load/Store instruction.
+ /// \returns NULL if this is not a valid Load/Store instruction.
+ static Value *getPointerOperand(Value *I);
+
+ /// \brief Take the address space operand from the Load/Store instruction.
+ /// \returns -1 if this is not a valid Load/Store instruction.
+ static unsigned getAddressSpaceOperand(Value *I);
+
+ /// \returns true if the memory operations A and B are consecutive.
+ bool isConsecutiveAccess(Value *A, Value *B);
+
+ /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// \returns the vectorized value.
+ Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
+ int getTreeCost(ArrayRef<Value *> VL);
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
+ int getScalarizationCost(ArrayRef<Value *> VL);
+
+ /// \brief Attempts to order and vectorize a sequence of stores. This
+ /// function does a quadratic scan of the given stores.
+ /// \returns true if the basic block was modified.
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int CostThreshold);
+
+ /// \brief Vectorize a group of scalars into a vector tree.
+ void vectorizeArith(ArrayRef<Value *> Operands);
+
+ /// \returns the list of new instructions that were added in order to collect
+ /// scalars into vectors. This list can be used to further optimize the gather
+ /// sequences.
+  ValueList &getGatherSeqInstructions() { return GatherInstructions; }
+
+private:
+ /// \brief This method contains the recursive part of getTreeCost.
+ int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+ /// \brief This recursive method looks for vectorization hazards such as
+ /// values that are used by multiple users and checks that values are used
+  /// by only one vector lane. It updates LaneMap and MultiUserVals.
+ void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+ /// \brief This method contains the recursive part of vectorizeTree.
+ Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
+
+ /// \brief Number all of the instructions in the block.
+ void numberInstructions();
+
+ /// \brief Vectorize a sorted sequence of stores.
+ bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
+ int getScalarizationCost(Type *Ty);
+
+  /// \returns the AA location that is being accessed by the instruction.
+ AliasAnalysis::Location getLocation(Instruction *I);
+
+ /// \brief Checks if it is possible to sink an instruction from
+ /// \p Src to \p Dst.
+ /// \returns the pointer to the barrier instruction if we can't sink.
+ Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
+
+ /// \returns the instruction that appears last in the BB from \p VL.
+ /// Only consider the first \p VF elements.
+ Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
+
+ /// \returns a vector from a collection of scalars in \p VL.
+ Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
+
+private:
+ /// Maps instructions to numbers and back.
+ SmallDenseMap<Value*, int> InstrIdx;
+ /// Maps integers to Instructions.
+ std::vector<Instruction*> InstrVec;
+
+  // -- Containers that are used during getTreeCost -- //
+
+ /// Contains values that must be scalarized because they are used
+ /// by multiple lanes, or by users outside the tree.
+ /// NOTICE: The vectorization methods also use this set.
+ ValueSet MustScalarize;
+
+  /// Contains values that have multiple users, which must be checked for
+  /// out-of-tree and cross-lane uses. This set must be reset between runs.
+ ValueSet MultiUserVals;
+  /// Maps values in the tree to the vector lanes that use them. This map
+  /// must be reset between runs of getTreeCost.
+ std::map<Value*, int> LaneMap;
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This set must be reset between runs of getTreeCost.
+ SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
+
+ // -- Containers that are used during vectorizeTree -- //
+
+  /// Maps the first scalar of each vectorized tree to its vector value. This
+  /// map must be reset between runs.
+ DenseMap<Value*, Value*> VectorizedValues;
+
+ // -- Containers that are used after vectorization by the caller -- //
+
+ /// A list of instructions that are used when gathering scalars into vectors.
+ /// In many cases these instructions can be hoisted outside of the BB.
+ /// Iterating over this list is faster than calling LICM.
+ ValueList GatherInstructions;
+
+ // Analysis and block reference.
+ BasicBlock *BB;
+ ScalarEvolution *SE;
+ DataLayout *DL;
+ TargetTransformInfo *TTI;
+ AliasAnalysis *AA;
+ Loop *L;
+};
+
+} // end of namespace
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 19eefd2..a927fe1 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,4 +1,4 @@
- //===-- Vectorize.cpp -----------------------------------------------------===//
+//===-- Vectorize.cpp -----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -28,6 +28,7 @@ using namespace llvm;
void llvm::initializeVectorization(PassRegistry &Registry) {
initializeBBVectorizePass(Registry);
initializeLoopVectorizePass(Registry);
+ initializeSLPVectorizerPass(Registry);
}
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
@@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopVectorizePass());
}
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSLPVectorizerPass());
+}