Diffstat (limited to 'lib/Transforms/Vectorize')
 lib/Transforms/Vectorize/CMakeLists.txt    |   2
 lib/Transforms/Vectorize/LoopVectorize.cpp | 538
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 348
 lib/Transforms/Vectorize/VecUtils.cpp      | 730
 lib/Transforms/Vectorize/VecUtils.h        | 164
 lib/Transforms/Vectorize/Vectorize.cpp     |   7
 6 files changed, 1651 insertions(+), 138 deletions(-)
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index e64034a..7ae082f 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp LoopVectorize.cpp + SLPVectorizer.cpp + VecUtils.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index acf2b81..08d3725 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops -// and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizer uses (will use) the codegen -// interfaces to generate IR that is likely to result in an optimal binary. +// and generates target-independent LLVM-IR. +// The vectorizer uses the TargetTransformInfo analysis to estimate the costs +// of instructions in order to estimate the profitability of vectorization. // // The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented @@ -78,7 +78,9 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -87,6 +89,7 @@ #include <map> using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, @@ -112,9 +115,9 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// When performing a runtime memory check, do not check more than this -/// number of pointers. Notice that the check is quadratic! -static const unsigned RuntimeMemoryCheckThreshold = 4; +/// When performing memory disambiguation checks at runtime do not make more +/// than this number of comparisons. +static const unsigned RuntimeMemoryCheckThreshold = 8; /// We use a metadata with this name to indicate that a scalar loop was /// vectorized and that we don't need to re-vectorize it if we run into it @@ -214,7 +217,7 @@ private: /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. - Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); + Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -333,7 +336,7 @@ public: DominatorTree *DT, TargetTransformInfo* TTI, AliasAnalysis *AA, TargetLibraryInfo *TLI) : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), - Induction(0) {} + Induction(0), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of reductions that we support. 
enum ReductionKind { @@ -343,8 +346,10 @@ public: RK_IntegerOr, ///< Bitwise or logical OR of numbers. RK_IntegerAnd, ///< Bitwise or logical AND of numbers. RK_IntegerXor, ///< Bitwise or logical XOR of numbers. + RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. - RK_FloatMult ///< Product of floats. + RK_FloatMult, ///< Product of floats. + RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). }; /// This enum represents the kinds of inductions that we support. @@ -356,21 +361,52 @@ public: IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; + // This enum represents the kind of minmax reduction. + enum MinMaxReductionKind { + MRK_Invalid, + MRK_UIntMin, + MRK_UIntMax, + MRK_SIntMin, + MRK_SIntMax, + MRK_FloatMin, + MRK_FloatMax + }; + /// This POD struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), - Kind(RK_NoReduction) {} + Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) - : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, + MinMaxReductionKind MK) + : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} // The starting value of the reduction. // It does not have to be zero! - Value *StartValue; + TrackingVH<Value> StartValue; // The instruction whose value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. ReductionKind Kind; + // If this is a min/max reduction, the kind of reduction. + MinMaxReductionKind MinMaxKind; + }; + + /// This POD struct holds information about a potential reduction operation. + struct ReductionInstDesc { + ReductionInstDesc(bool IsRedux, Instruction *I) : + IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} + + ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : + IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} + + // Is this instruction a reduction candidate? + bool IsReduction; + // The last instruction in a min/max pattern (select of the select(icmp()) + // pattern), or the current reduction instruction otherwise. + Instruction *PatternLastInst; + // If this is a min/max pattern, the comparison predicate. + MinMaxReductionKind MinMaxKind; }; // This POD struct holds information about the memory runtime legality @@ -387,16 +423,18 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. - SmallVector<Value*, 2> Pointers; + SmallVector<TrackingVH<Value>, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. SmallVector<const SCEV*, 2> Ends; + /// Holds, for each pointer, whether it is used for writing to memory. + SmallVector<bool, 2> IsWritePtr; }; /// A POD for saving information about induction variables. @@ -404,7 +442,7 @@ public: InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. - Value *StartValue; + TrackingVH<Value> StartValue; /// Induction kind.
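The two new RK_*MinMax kinds describe reductions whose recurrence is a compare followed by a select rather than a single binary operator. A standalone sketch (hypothetical example code, not part of the patch) of the scalar loop shape that maps to RK_IntegerMinMax with MRK_SIntMax:

    #include <climits>

    // Max-reduction loop: the recurrence through the PHI is an icmp plus a
    // select, i.e. two instructions per iteration instead of one binary op.
    int maxReduction(const int *a, int n) {
      int mx = INT_MIN;                // StartValue of the reduction PHI
      for (int i = 0; i < n; ++i)
        mx = (mx < a[i]) ? a[i] : mx;  // select(icmp slt ...) -> MRK_SIntMax
      return mx;                       // the loop-exit user (LoopExitInstr)
    }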
InductionKind IK; }; @@ -461,6 +499,10 @@ public: /// Returns the information that we collected about runtime memory check. RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + + /// This function returns the identity element (or neutral element) for + /// the operation K. + static Constant *getReductionIdentity(ReductionKind K, Type *Tp); private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -487,9 +529,17 @@ private: /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns true if the instruction I can be a reduction variable of type - /// 'Kind'. - bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns a struct describing if the instruction 'I' can be a reduction + /// variable of type 'Kind'. If the reduction is a min/max pattern of + /// select(icmp()) this function advances the instruction pointer 'I' from the + /// compare instruction to the select instruction and stores this pointer in + /// 'PatternLastInst' member of the returned struct. + ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, + ReductionInstDesc &Desc); + /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction + /// pattern corresponding to a min(X, Y) or max(X, Y). + static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev); /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); @@ -540,6 +590,8 @@ private: /// We need to check that all of the pointers in this list are disjoint /// at runtime. RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. 
+ bool HasFunNoNaNAttr; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -662,6 +714,11 @@ struct LoopVectorize : public LoopPass { AA = getAnalysisIfAvailable<AliasAnalysis>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + if (DL == NULL) { + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + return false; + } + DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -737,7 +794,8 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, - Loop *Lp, Value *Ptr) { + Loop *Lp, Value *Ptr, + bool WritePtr) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); @@ -746,6 +804,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); + IsWritePtr.push_back(WritePtr); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -771,7 +830,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && @@ -784,8 +843,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) { - int Idx = Negate ? (-i): i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + int64_t Idx = Negate ? (-i) : i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); } // Add the consecutive indices to the vector value. @@ -906,12 +965,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); + unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + + if (ScalarAllocatedSize != VectorElementSize) + return scalarizeInstruction(Instr); + // If the pointer is loop invariant or if it is non consecutive, // scalarize the load. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; bool UniformLoad = LI && Legal->isUniform(Ptr); - if (Stride == 0 || UniformLoad) + if (!ConsecutiveStride || UniformLoad) return scalarizeInstruction(Instr); Constant *Zero = Builder.getInt32(0); @@ -1040,10 +1105,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Create a new entry in the WidenMap and initialize it to Undef or Null. 
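The ScalarAllocatedSize != VectorElementSize bail-out added in vectorizeMemoryInstruction above rejects types whose in-memory allocation is padded beyond their store size, where one wide store would cover different bytes than VF scalar stores. A worked illustration; the x86_fp80 sizes are assumed typical x86-64 DataLayout values, not taken from the patch:

    #include <cstdio>

    int main() {
      // x86_fp80 on x86-64: 10 bytes are stored, but each element is
      // allocated 16 bytes for alignment.
      unsigned ScalarAllocatedSize = 16; // DL->getTypeAllocSize(ScalarDataTy)
      unsigned VectorStoreSize = 40;     // DL->getTypeStoreSize(<4 x x86_fp80>)
      unsigned VF = 4;
      unsigned VectorElementSize = VectorStoreSize / VF; // 10
      // 16 != 10: a <4 x x86_fp80> store writes 40 contiguous bytes, while
      // four scalar stores touch 4 * 16 bytes with padding gaps between them.
      std::printf("%s\n", ScalarAllocatedSize != VectorElementSize
                              ? "scalarize" : "widen");
    }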
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - // For each scalar that we create: - for (unsigned Width = 0; Width < VF; ++Width) { - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + for (unsigned Width = 0; Width < VF; ++Width) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -1110,6 +1175,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { + // No need to check if two readonly pointers intersect. + if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) + continue; + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); @@ -1167,7 +1236,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Mark the old scalar loop with metadata that tells us not to vectorize this // loop again if we run into it. - MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>()); + MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None); OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD); // Some loops have a single integer induction variable, while other loops @@ -1436,24 +1505,24 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. -static Constant* -getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { +Constant* +LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { switch (K) { - case LoopVectorizationLegality:: RK_IntegerXor: - case LoopVectorizationLegality:: RK_IntegerAdd: - case LoopVectorizationLegality:: RK_IntegerOr: + case RK_IntegerXor: + case RK_IntegerAdd: + case RK_IntegerOr: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); - case LoopVectorizationLegality:: RK_IntegerMult: + case RK_IntegerMult: // Multiplying a number by 1 does not change it. return ConstantInt::get(Tp, 1); - case LoopVectorizationLegality:: RK_IntegerAnd: + case RK_IntegerAnd: // AND-ing a number with an all-1 value does not change it. return ConstantInt::get(Tp, -1, true); - case LoopVectorizationLegality:: RK_FloatMult: + case RK_FloatMult: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); - case LoopVectorizationLegality:: RK_FloatAdd: + case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); default: @@ -1566,7 +1635,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { } /// This function translates the reduction kind to an LLVM binary operator. 
-static Instruction::BinaryOps +static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: @@ -1583,11 +1652,53 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { return Instruction::FMul; case LoopVectorizationLegality::RK_FloatAdd: return Instruction::FAdd; + case LoopVectorizationLegality::RK_IntegerMinMax: + return Instruction::ICmp; + case LoopVectorizationLegality::RK_FloatMinMax: + return Instruction::FCmp; default: llvm_unreachable("Unknown reduction operation"); } } +Value *createMinMaxOp(IRBuilder<> &Builder, + LoopVectorizationLegality::MinMaxReductionKind RK, + Value *Left, + Value *Right) { + CmpInst::Predicate P = CmpInst::ICMP_NE; + switch (RK) { + default: + llvm_unreachable("Unknown min/max reduction kind"); + case LoopVectorizationLegality::MRK_UIntMin: + P = CmpInst::ICMP_ULT; + break; + case LoopVectorizationLegality::MRK_UIntMax: + P = CmpInst::ICMP_UGT; + break; + case LoopVectorizationLegality::MRK_SIntMin: + P = CmpInst::ICMP_SLT; + break; + case LoopVectorizationLegality::MRK_SIntMax: + P = CmpInst::ICMP_SGT; + break; + case LoopVectorizationLegality::MRK_FloatMin: + P = CmpInst::FCMP_OLT; + break; + case LoopVectorizationLegality::MRK_FloatMax: + P = CmpInst::FCMP_OGT; + break; + } + + Value *Cmp; + if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); + else + Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); + return Select; +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1651,13 +1762,24 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. - Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType()); - Constant *Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + Value *Identity; + Value *VectorStart; + if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || + RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { + // MinMax reductions have the start value as their identity. + VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, + "minmax.ident"); + } else { + Constant *Iden = + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } // Fix the vector-loop phi. // We created the induction variable so we know that the @@ -1699,10 +1821,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reduce all of the unrolled parts into a single vector.
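The epilogue reduction that follows happens in two stages: the UF unrolled parts are folded into one vector, and that vector is then collapsed with log2(VF) shuffle-and-combine rounds. A standalone scalar sketch (illustrative, not patch code) of the shuffle tree for an integer add with VF = 8:

    #include <cstdio>

    int main() {
      int vec[8] = {1, 2, 3, 4, 5, 6, 7, 8};        // one <8 x i32> part
      for (int half = 8 / 2; half >= 1; half /= 2)  // log2(8) = 3 rounds
        for (int i = 0; i < half; ++i)
          vec[i] += vec[i + half];  // "rdx.shuf" then "bin.rdx" on the halves
      std::printf("sum = %d\n", vec[0]);            // result lives in lane 0
    }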
Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = getReductionBinOp(RdxDesc.Kind); for (unsigned part = 1; part < UF; ++part) { - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx, - "bin.rdx"); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, + RdxParts[part], ReducedPartRdx, + "bin.rdx"); + else + ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, + ReducedPartRdx, RdxParts[part]); } // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles @@ -1727,8 +1854,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); } // The result is in the first element of the vector. @@ -1861,18 +1991,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // We know that all PHIs in non header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. - // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), - P->getParent()); - for (unsigned part = 0; part < UF; ++part) { - VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); - VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], - "predphi"); + unsigned NumIncoming = P->getNumIncomingValues(); + assert(NumIncoming > 1 && "Invalid PHI"); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We don't need to 'select' the first PHI operand because it is + // the default value if all of the other masks don't match. + if (In == 0) + Entry[part] = In0[part]; + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); + } } continue; } @@ -1928,7 +2073,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); continue; } @@ -2152,12 +2298,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa<BranchInst>(BB->getTerminator())) return false; - // We must have at most two predecessors because we need to convert - // all PHIs to selects. 
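The predecessor limit deleted just below is what the select-chain PHI lowering above makes unnecessary: any number of incoming values folds into nested selects. In scalar form (hypothetical helper, not from the patch):

    // In0 is the unconditional default; every later incoming value
    // overwrites the accumulated result when its edge mask is set, yielding
    // one "predphi" select per extra predecessor.
    float flattenPhi(const float *in, const bool *edgeMask,
                     unsigned numIncoming) {
      float entry = in[0];                   // In == 0: no select needed
      for (unsigned i = 1; i < numIncoming; ++i)
        entry = edgeMask[i] ? in[i] : entry; // select(Cond, In, previous)
      return entry;
    }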
- unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); - if (Preds > 2) - return false; - // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) return false; @@ -2168,7 +2308,10 @@ } bool LoopVectorizationLegality::canVectorize() { - assert(TheLoop->getLoopPreheader() && "No preheader!!"); + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!TheLoop->getLoopPreheader()) + return false; // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) @@ -2235,6 +2378,26 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +/// \brief Check that the instruction has outside loop users and is not an +/// identified reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSet<Value *, 4> &Reductions) { + // Reduction instructions are allowed to have exit users. All other + // instructions must not have external users. + if (!Reductions.count(Inst)) + // Check that all of the users of the instruction are inside the loop. + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. + if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return true; + } + } + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); @@ -2246,6 +2409,13 @@ return false; } + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + if (F.hasFnAttribute("no-nans-fp-math")) + HasFunNoNaNAttr = F.getAttributes().getAttribute( + AttributeSet::FunctionIndex, + "no-nans-fp-math").getValueAsString() == "true"; + // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { @@ -2255,12 +2425,6 @@ ++it) { if (PHINode *Phi = dyn_cast<PHINode>(it)) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - // Check that this PHI type is allowed. if (!Phi->getType()->isIntegerTy() && !Phi->getType()->isFloatingPointTy() && @@ -2272,8 +2436,19 @@ // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. - if (*bb != Header) - continue; + if (*bb != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if(!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + continue; + return false; + } + + // We only allow if-converted PHIs with exactly two incoming values. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // This is the value coming from the preheader.
Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); @@ -2315,6 +2490,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_IntegerMinMax)) { + DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } if (AddReductionVar(Phi, RK_FloatMult)) { DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); continue; @@ -2323,6 +2502,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_FloatMinMax)) { + DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; @@ -2352,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(it)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = it->use_begin(), E = it->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); - // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } - } + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + } // next instr. } @@ -2442,13 +2617,6 @@ LoopVectorizationLegality::hasPossibleGlobalWriteReorder( bool LoopVectorizationLegality::canVectorizeMemory() { - if (TheLoop->isAnnotatedParallel()) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; // Holds the Load and Store *instructions*. @@ -2457,6 +2625,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { @@ -2471,7 +2641,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (it->mayReadFromMemory()) { LoadInst *Ld = dyn_cast<LoadInst>(it); if (!Ld) return false; - if (!Ld->isSimple()) { + if (!Ld->isSimple() && !IsAnnotatedParallel) { DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } @@ -2483,7 +2653,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(it); if (!St) return false; - if (!St->isSimple()) { + if (!St->isSimple() && !IsAnnotatedParallel) { DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } @@ -2530,6 +2700,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { ReadWrites.insert(std::make_pair(Ptr, ST)); } + if (IsAnnotatedParallel) { + DEBUG(dbgs() + << "LV: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); + return true; + } + for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast<LoadInst>(*I); Value* Ptr = LD->getPointerOperand(); @@ -2552,6 +2729,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } + unsigned NumReadPtrs = 0; + unsigned NumWritePtrs = 0; + // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check.
bool CanDoRT = true; @@ -2559,7 +2739,8 @@ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { Value *V = (*MI).first; if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V); + PtrRtCheck.insert(SE, TheLoop, V, true); + NumWritePtrs++; DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; @@ -2569,7 +2750,8 @@ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { Value *V = (*MI).first; if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V, false); + NumReadPtrs++; DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; @@ -2579,7 +2761,9 @@ // Check that we did not collect too many pointers or found an // unsizeable pointer. - if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); + DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; } @@ -2642,8 +2826,8 @@ Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI + << "\n"); return false; } @@ -2686,8 +2870,8 @@ Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI + << "\n"); return false; } } @@ -2733,7 +2917,18 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // used as reduction variables (such as ADD). We may have a single // out-of-block user. The cycle must end with the original PHI. Instruction *Iter = Phi; - while (true) { + + // To recognize min/max patterns formed by an icmp/select sequence, we count + // the number of instructions we saw from the recognized min/max pattern, + // so that we don't stop when we see that the phi has two uses (one by the + // select and one by the icmp) and to make sure we see exactly those two + // instructions. + unsigned NumCmpSelectPatternInst = 0; + ReductionInstDesc ReduxDesc(false, 0); + + // Avoid cycles in the chain. + SmallPtrSet<Instruction *, 8> VisitedInsts; + while (VisitedInsts.insert(Iter)) { // If the instruction has no users then this is a broken // chain and can't be a reduction variable. if (Iter->use_empty()) @@ -2747,9 +2942,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Is this a bin op? FoundBinOp |= !isa<PHINode>(Iter); - // Remember the current instruction. - Instruction *OldIter = Iter; - // For each of the *users* of iter. for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { @@ -2778,25 +2970,35 @@ Iter->hasNUsesOrMore(2)) continue; - // We can't have multiple inside users. - if (FoundInBlockUser) + // We can't have multiple inside users except for a combination of + // icmp/select both using the phi.
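The comparison budget replacing the old pointer-count limit: read/read pairs are skipped, so the patch estimates the check count as writes * (reads + writes - 1), a conservative upper bound on the i < j pairs the emitted check actually tests. Worked numbers (standalone, illustrative):

    #include <cstdio>

    int main() {
      unsigned NumWritePtrs = 2, NumReadPtrs = 3;
      unsigned NumComparisons =
          NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1);
      // 2 * (3 + 2 - 1) = 8, exactly the new RuntimeMemoryCheckThreshold;
      // the old limit of 4 pointers allowed at most 4 * 3 / 2 = 6 pairwise
      // checks regardless of the read/write mix.
      std::printf("estimated checks: %u\n", NumComparisons);
    }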
+ if (FoundInBlockUser && !NumCmpSelectPatternInst) return false; FoundInBlockUser = true; // Any reduction instr must be of one of the allowed kinds. - if (!isReductionInstr(U, Kind)) + ReduxDesc = isReductionInstr(U, Kind, ReduxDesc); + if (!ReduxDesc.IsReduction) return false; + if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U))) + ++NumCmpSelectPatternInst; + if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U))) + ++NumCmpSelectPatternInst; + // Reductions of instructions such as Div and Sub are only // possible if the LHS is the reduction variable. - if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter) + if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) && + !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter) return false; - Iter = U; + Iter = ReduxDesc.PatternLastInst; } - // If all uses were skipped this can't be a reduction variable. - if (Iter == OldIter) + // This means we have seen one but not the other instruction of the + // min/max pattern, or more than just the select and cmp. + if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && + NumCmpSelectPatternInst != 2) return false; // We found a reduction var if we have reached the original @@ -2807,47 +3009,107 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, AllowedExit.insert(ExitInstruction); // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); Reductions[Phi] = RD; // We've ended the cycle. This is a reduction variable if we have an // outside user and it has a binary op. return FoundBinOp && ExitInstruction; } } + + return false; } -bool +/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction +/// pattern corresponding to a min(X, Y) or max(X, Y). +LoopVectorizationLegality::ReductionInstDesc +LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev) { + + assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && + "Expected a cmp or select instruction"); + Instruction *Cmp = 0; + SelectInst *Select = 0; + + // We must handle the select(cmp()) as a single instruction. Advance to the + // select. + if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { + if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin()))) + return ReductionInstDesc(false, I); + return ReductionInstDesc(Select, Prev.MinMaxKind); + } + + // Only handle single use cases for now. + if (!(Select = dyn_cast<SelectInst>(I))) + return ReductionInstDesc(false, I); + if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && + !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) + return ReductionInstDesc(false, I); + if (!Cmp->hasOneUse()) + return ReductionInstDesc(false, I); + + Value *CmpLeft; + Value *CmpRight; + + // Look for a min/max pattern.
+ if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMin); + else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMax); + else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMax); + else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMin); + else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + + return ReductionInstDesc(false, I); +} + +LoopVectorizationLegality::ReductionInstDesc LoopVectorizationLegality::isReductionInstr(Instruction *I, - ReductionKind Kind) { + ReductionKind Kind, + ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); bool FastMath = (FP && I->isCommutative() && I->isAssociative()); - switch (I->getOpcode()) { default: - return false; + return ReductionInstDesc(false, I); case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) - return false; - // possibly. - return true; + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && + Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: case Instruction::Add: - return Kind == RK_IntegerAdd; - case Instruction::SDiv: - case Instruction::UDiv: + return ReductionInstDesc(Kind == RK_IntegerAdd, I); case Instruction::Mul: - return Kind == RK_IntegerMult; + return ReductionInstDesc(Kind == RK_IntegerMult, I); case Instruction::And: - return Kind == RK_IntegerAnd; + return ReductionInstDesc(Kind == RK_IntegerAnd, I); case Instruction::Or: - return Kind == RK_IntegerOr; + return ReductionInstDesc(Kind == RK_IntegerOr, I); case Instruction::Xor: - return Kind == RK_IntegerXor; + return ReductionInstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return Kind == RK_FloatMult && FastMath; + return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FAdd: - return Kind == RK_FloatAdd && FastMath; - } + return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + if (Kind != RK_IntegerMinMax && + (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return isMinMaxSelectCmpPattern(I, Prev); + } } LoopVectorizationLegality::InductionKind @@ -3380,9 +3642,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); // Scalarized loads/stores. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (0 == Stride) { + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); + unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; + if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. 
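Each matcher in the chain above corresponds to one scalar compare/select shape; in plain C++ (illustrative only), the recognized forms and the MinMaxReductionKind they map to are:

    unsigned umin(unsigned a, unsigned b) { return a < b ? a : b; } // m_UMin -> MRK_UIntMin
    unsigned umax(unsigned a, unsigned b) { return a > b ? a : b; } // m_UMax -> MRK_UIntMax
    int      smin(int a, int b)           { return a < b ? a : b; } // m_SMin -> MRK_SIntMin
    int      smax(int a, int b)           { return a > b ? a : b; } // m_SMax -> MRK_SIntMax
    // Ordered and unordered float variants both map to MRK_FloatMin/Max;
    // they are recognized only when the function carries "no-nans-fp-math".
    float    fmin_(float a, float b)      { return a < b ? a : b; } // m_OrdFMin -> MRK_FloatMin
    float    fmax_(float a, float b)      { return a > b ? a : b; } // m_OrdFMax -> MRK_FloatMax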
Type *PtrTy = ToVectorTy(Ptr->getType(), VF); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp new file mode 100644 index 0000000..cc30cc9 --- /dev/null +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -0,0 +1,348 @@ +//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass implements the Bottom Up SLP vectorizer. It detects consecutive +// stores that can be put together into vector-stores. Next, it attempts to +// construct a vectorizable tree using the use-def chains. If a profitable tree +// was found, the SLP vectorizer performs vectorization on the tree. +// +// The pass is inspired by the work described in the paper: +// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. +// +//===----------------------------------------------------------------------===// +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE SV_NAME + +#include "VecUtils.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <map> + +using namespace llvm; + +static cl::opt<int> +SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, + cl::desc("Only vectorize trees if the gain is above this " + "number. (gain = -cost of vectorization)")); +namespace { + +/// The SLPVectorizer Pass. +struct SLPVectorizer : public FunctionPass { + typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap; + + /// Pass identification, replacement for typeid + static char ID; + + explicit SLPVectorizer() : FunctionPass(ID) { + initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + + virtual bool runOnFunction(Function &F) { + SE = &getAnalysis<ScalarEvolution>(); + DL = getAnalysisIfAvailable<DataLayout>(); + TTI = &getAnalysis<TargetTransformInfo>(); + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + + StoreRefs.clear(); + bool Changed = false; + + // Must have DataLayout. We can't require it because some tests run w/o + // triple. + if (!DL) + return false; + + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { + BasicBlock *BB = it; + bool BBChanged = false; + + // Use the bottom-up SLP vectorizer to construct chains that start with + // the store instructions. + BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); + + // Vectorize trees that end at reductions. + BBChanged |= vectorizeReductions(BB, R); + + // Vectorize trees that end at stores. + if (unsigned count = collectStores(BB, R)) { + (void)count; + DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n"); + BBChanged |= vectorizeStoreChains(R); + } + + // Try to hoist some of the scalarization code to the preheader.
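In source terms, the seed pattern the new pass hunts for is a run of stores to consecutive addresses fed by isomorphic expression trees. A hypothetical candidate (not from the patch) that the store-chain path could rewrite into one <4 x i32> load/add/store sequence:

    void slpCandidate(int *a, const int *b, const int *c) {
      // Four consecutive stores (the seeds found by collectStores), each fed
      // by the same opcode tree over consecutive loads: a vectorizable tree.
      a[0] = b[0] + c[0];
      a[1] = b[1] + c[1];
      a[2] = b[2] + c[2];
      a[3] = b[3] + c[3];
    }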
+ if (BBChanged) hoistGatherSequence(LI, BB, R); + + Changed |= BBChanged; + } + + if (Changed) { + DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n"); + DEBUG(verifyFunction(F)); + } + return Changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<TargetTransformInfo>(); + AU.addRequired<LoopInfo>(); + } + +private: + + /// \brief Collect memory references and sort them according to their base + /// object. We sort the stores to their base objects to reduce the cost of the + /// quadratic search on the stores. TODO: We can further reduce this cost + /// if we flush the chain creation every time we run into a memory barrier. + unsigned collectStores(BasicBlock *BB, BoUpSLP &R); + + /// \brief Try to vectorize a chain that starts at two arithmetic instrs. + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + + /// \brief Try to vectorize a list of operands. + bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); + + /// \brief Try to vectorize a chain that may start at the operands of \p V. + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); + + /// \brief Vectorize the stores that were collected in StoreRefs. + bool vectorizeStoreChains(BoUpSLP &R); + + /// \brief Try to hoist gather sequences outside of the loop in cases where + /// all of the sources are loop invariant. + void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); + + /// \brief Scan the basic block and look for reductions that may start a + /// vectorization chain. + bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R); + +private: + StoreListMap StoreRefs; +}; + +unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { + unsigned count = 0; + StoreRefs.clear(); + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + StoreInst *SI = dyn_cast<StoreInst>(it); + if (!SI) + continue; + + // Check that the pointer points to scalars. + Type *Ty = SI->getValueOperand()->getType(); + if (Ty->isAggregateType() || Ty->isVectorTy()) + return 0; + + // Find the base of the GEP. + Value *Ptr = SI->getPointerOperand(); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEP->getPointerOperand(); + + // Save the store locations. + StoreRefs[Ptr].push_back(SI); + count++; + } + return count; +} + +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { + if (!A || !B) return false; + Value *VL[] = { A, B }; + return tryToVectorizeList(VL, R); +} + +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { + DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + + // Check that all of the parts are scalar. + for (int i = 0, e = VL.size(); i < e; ++i) { + Type *Ty = VL[i]->getType(); + if (Ty->isAggregateType() || Ty->isVectorTy()) + return false; + } + + int Cost = R.getTreeCost(VL); + int ExtrCost = R.getScalarizationCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << + " Cost of extract:" << ExtrCost << ".\n"); + if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; + DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); + R.vectorizeArith(VL); + return true; +} + +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { + if (!V) return false; + // Try to vectorize V.
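tryToVectorize, which begins above, probes a few tree shapes around a binary operator: its own two operands first, then, when one operand is a single-use binary operator, that operand's children paired against the other side, moving the skipped instruction before V so later extracts stay dominated. The kind of expression this catches (illustrative, hypothetical names):

    int horizontalAdd(int a, int b, int c, int d) {
      // For V = (a*b) + (c*d), the pass tries the pair {a*b, c*d} first;
      // if that fails it tries {a*b, c} and {a*b, d} (skipping B), then
      // {a, c*d} and {b, c*d} (skipping A).
      return (a * b) + (c * d);
    }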
+ if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + return true; + + BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); + BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); + // Try to skip B. + if (B && B->hasOneUse()) { + BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); + BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); + if (tryToVectorizePair(A, B0, R)) { + B->moveBefore(V); + return true; + } + if (tryToVectorizePair(A, B1, R)) { + B->moveBefore(V); + return true; + } + } + + // Try to skip A. + if (A && A->hasOneUse()) { + BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); + BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); + if (tryToVectorizePair(A0, B, R)) { + A->moveBefore(V); + return true; + } + if (tryToVectorizePair(A1, B, R)) { + A->moveBefore(V); + return true; + } + } + return 0; +} + +bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (isa<DbgInfoIntrinsic>(it)) continue; + + // Try to vectorize reductions that use PHINodes. + if (PHINode *P = dyn_cast<PHINode>(it)) { + // Check that the PHI is a reduction PHI. + if (P->getNumIncomingValues() != 2) return Changed; + Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : + (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : + 0)); + // Check if this is a Binary Operator. + BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); + if (!BI) + continue; + + Value *Inst = BI->getOperand(0); + if (Inst == P) Inst = BI->getOperand(1); + Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R); + continue; + } + + // Try to vectorize trees that start at compare instructions. + if (CmpInst *CI = dyn_cast<CmpInst>(it)) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { + Changed |= true; + continue; + } + for (int i = 0; i < 2; ++i) + if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) + Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); + continue; + } + } + + return Changed; +} + +bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { + bool Changed = false; + // Attempt to sort and vectorize each of the store-groups. + for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); + it != e; ++it) { + if (it->second.size() < 2) + continue; + + DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << + it->second.size() << ".\n"); + + Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + } + return Changed; +} + +void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, + BoUpSLP &R) { + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(BB); + if (!L) + return; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + return; + + // Mark the insertion point for the block. + Instruction *Location = PreHeader->getTerminator(); + + BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); + for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); + it != e; ++it) { + InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); + + // The InsertElement sequence can be simplified into a constant. + if (!Insert) + continue; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. 
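The hoisting decision being made here, restated at source level (hypothetical code, not from the patch): when every scalar feeding an insertelement gather is defined outside the loop, the whole gather can be built once in the preheader instead of on every iteration.

    void gatherHoisted(float *out, float a, float b, float c, float d, int n) {
      // Before hoisting, {a, b, c, d} is re-gathered inside the loop; since
      // a..d are loop-invariant, the insertelement chain moves up here,
      // before the preheader's terminator.
      float vec[4] = {a, b, c, d};
      for (int i = 0; i < n; ++i)
        out[i] += vec[i & 3];
    }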
+ Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) continue; + if (NewElem && L->contains(NewElem)) continue; + + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(Location); + } +} + +} // end anonymous namespace + +char SLPVectorizer::ID = 0; +static const char lv_name[] = "SLP Vectorizer"; +INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createSLPVectorizerPass() { + return new SLPVectorizer(); + } +} + diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp new file mode 100644 index 0000000..9b94366 --- /dev/null +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -0,0 +1,730 @@ +//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "SLP" + +#include "VecUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <map> + +using namespace llvm; + +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 6; + +namespace llvm { + +BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : + BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { + numberInstructions(); +} + +void BoUpSLP::numberInstructions() { + int Loc = 0; + InstrIdx.clear(); + InstrVec.clear(); + // Number the instructions in the block. 
+ for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { + InstrIdx[it] = Loc++; + InstrVec.push_back(it); + assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); + } +} + +Value *BoUpSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); + return 0; +} + +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); + if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace(); + return -1; +} + +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) return false; + + // Check that A and B are of the same type. + if (PtrA->getType() != PtrB->getType()) return false; + + // Calculate the distance. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); + const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV); + + // Non-constant distance. + if (!ConstOffSCEV) return false; + + int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); + Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); + // The instructions are consecutive if the size of the first load/store is + // the same as the offset. + int64_t Sz = DL->getTypeStoreSize(Ty); + return ((-Offset) == Sz); +} + +bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { + Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = Chain.size(); i < e; ++i) { + if (i + VF > e) return Changed; + DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n"); + ArrayRef<Value *> Operands = Chain.slice(i, VF); + + int Cost = getTreeCost(Operands); + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + vectorizeTree(Operands, VF); + i += VF - 1; + Changed = true; + } + } + + return Changed; +} + +bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { + ValueSet Heads, Tails; + SmallDenseMap<Value*, Value*> ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other.
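The consecutiveness test above boils down to one SCEV subtraction: A immediately precedes B when SCEV(PtrA) - SCEV(PtrB) equals minus the element's store size. The same arithmetic with ordinary pointers (standalone example, illustrative only):

    #include <cstdio>

    int main() {
      int arr[2];
      // Offset = SCEV(PtrA) - SCEV(PtrB) with PtrA = &arr[0], PtrB = &arr[1].
      long offset = (char *)&arr[0] - (char *)&arr[1]; // -4 on typical targets
      long sz = sizeof(int);                           // getTypeStoreSize
      std::printf("consecutive: %d\n", -offset == sz); // prints 1
    }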
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i) + for (unsigned j = 0; j < e; ++j) { + if (i == j) continue; + if (isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + + // For stores that start but don't end a link in the chain: + for (ValueSet::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) { + if (Tails.count(*it)) continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold); + + // Mark the vectorized stores so that we don't vectorize them again. + if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + +int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. + return getScalarizationCost(VecTy); +} + +int BoUpSLP::getScalarizationCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instructions from Src to Dst and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. + if (MemBarrierIgnoreList.count(I)) continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) continue; + } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); + + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} + +void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { + Value *Vec = vectorizeTree(Operands, Operands.size()); + BasicBlock::iterator Loc = cast<Instruction>(Vec); + IRBuilder<> Builder(++Loc); + // After vectorizing the operands we need to generate extractelement + // instructions and replace all of the uses of the scalar values with + // the values that we extracted from the vectorized tree. + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } +} + +int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { + // Clear the list of removed stores and the + // lists of instructions with multiple users.
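The chain walk in vectorizeStores above can be mirrored with a toy index-based map (hypothetical data, same control flow): start from each head that is not also a tail and follow the recorded links.

    #include <map>
    #include <set>
    #include <vector>

    std::vector<int> collectChain(int head, const std::set<int> &heads,
                                  const std::set<int> &tails,
                                  const std::map<int, int> &next) {
      std::vector<int> chain;
      for (int i = head; heads.count(i) || tails.count(i);) {
        chain.push_back(i);                // Operands.push_back(I)
        std::map<int, int>::const_iterator it = next.find(i);
        if (it == next.end()) break;       // end of the recorded links
        i = it->second;                    // I = ConsecutiveChain[I]
      }
      return chain;
    }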
+  // Clear the state that is left over from the previous run: the list of
+  // stores to ignore, the lane map, and the lists of instructions with
+  // multiple users.
+  MemBarrierIgnoreList.clear();
+  LaneMap.clear();
+  MultiUserVals.clear();
+  MustScalarize.clear();
+
+  // Scan the tree and find which value is used by which lane, and which values
+  // must be scalarized.
+  getTreeUses_rec(VL, 0);
+
+  // Check that instructions with multiple users can be vectorized. Mark unsafe
+  // instructions.
+  for (ValueSet::iterator it = MultiUserVals.begin(),
+       e = MultiUserVals.end(); it != e; ++it) {
+    // Check that all of the users of this instruction are within the tree
+    // and that they are all from the same lane.
+    int Lane = -1;
+    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
+         I != E; ++I) {
+      if (LaneMap.find(*I) == LaneMap.end()) {
+        MustScalarize.insert(*it);
+        DEBUG(dbgs() << "SLP: Adding " << **it <<
+              " to MustScalarize because of an out of tree usage.\n");
+        break;
+      }
+      if (Lane == -1) Lane = LaneMap[*I];
+      if (Lane != LaneMap[*I]) {
+        MustScalarize.insert(*it);
+        DEBUG(dbgs() << "SLP: Adding " << **it <<
+              " to MustScalarize because multiple lanes use it: "
+              << Lane << " and " << LaneMap[*I] << ".\n");
+        break;
+      }
+    }
+  }
+
+  // Now calculate the cost of vectorizing the tree.
+  return getTreeCost_rec(VL, 0);
+}
+
+void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  if (Depth == RecursionMaxDepth) return;
+
+  // Don't handle vectors.
+  if (VL[0]->getType()->isVectorTy()) return;
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    if (SI->getValueOperand()->getType()->isVectorTy()) return;
+
+  // Check if all of the operands are constants or identical scalars.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If one of the instructions is outside of this BB, we need to scalarize
+    // all of them.
+    if (I && I->getParent() != BB) return;
+  }
+
+  // If all of the operands are identical or constant we have a simple
+  // solution.
+  if (AllConst || AllSameScalar) return;
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return;
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return;
+  }
+
+  // Mark instructions with multiple users.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Remember to check whether all of the users of this instruction are
+    // vectorized within our tree.
+    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
+  }
+
+  for (int i = 0, e = VL.size(); i < e; ++i) {
+    // Check that the instruction is only used within one lane.
+    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
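+    // For example (hypothetical IR): when pairing the two adds
+    //   %a0 = add i32 %x, %y       ; lane 0
+    //   %a1 = add i32 %y, %z       ; lane 1
+    // the operand bundle {%x, %y} assigns %y to lane 1, and the bundle
+    // {%y, %z} then asks for %y in lane 0, so the check above rejects the
+    // subtree. Independently, %y has users in both lanes, so the scan in
+    // getTreeCost later moves it into the MustScalarize set.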
+    // Mark this instruction as 'seen' and remember the lane.
+    LaneMap[VL[i]] = i;
+  }
+
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      getTreeUses_rec(Operands, Depth + 1);
+    }
+    return;
+  }
+  case Instruction::Store: {
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j)
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+    getTreeUses_rec(Operands, Depth + 1);
+    return;
+  }
+  default:
+    return;
+  }
+}
+
+int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  // Don't vectorize values that are already vectors.
+  if (ScalarTy->isVectorTy()) return max_cost;
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
+
+  // Check if all of the operands are constants or identical scalars.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  bool MustScalarizeFlag = false;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Check whether this value was marked as one that must be scalarized.
+    MustScalarizeFlag |= MustScalarize.count(VL[i]);
+    // If the instruction is outside of this basic block we must scalarize.
+    if (I && I->getParent() != BB)
+      return getScalarizationCost(VecTy);
+  }
+
+  // A vector of constants is free.
+  if (AllConst) return 0;
+
+  // If all of the operands are identical we can broadcast them.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (AllSameScalar) {
+    // If we are in a loop, and this is not an instruction (e.g. constant or
+    // argument) or the instruction is defined outside the loop then assume
+    // that the cost is zero.
+    if (L && (!VL0 || !L->contains(VL0)))
+      return 0;
+
+    // We need to broadcast the scalar.
+    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+  }
+
+  // If any of the values was marked for scalarization then we must scalarize
+  // all of them.
+  if (MustScalarizeFlag)
+    return getScalarizationCost(VecTy);
+
+  if (!VL0) return getScalarizationCost(VecTy);
+  assert(VL0->getParent() == BB && "Wrong BB");
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
+  }
+
+  // Check if it is safe to sink the loads or the stores.
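+  // For example (hypothetical IR): to form one wide load from {%l0, %l1},
+  // %l0 must move down to the position of %l1. In
+  //   %l0 = load i32* %p
+  //   store i32 %v, i32* %q     ; may alias %p
+  //   %l1 = load i32* %p1
+  // sinking %l0 past a possibly-aliasing store would change the loaded
+  // value, so isUnsafeToSink reports the store as a barrier and the bundle
+  // is given max_cost below.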
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    int MaxIdx = InstrIdx[VL0];
+    for (unsigned i = 1, e = VL.size(); i < e; ++i)
+      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+
+    Instruction *Last = InstrVec[MaxIdx];
+    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+      if (VL[i] == Last) continue;
+      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
+              *Last << "\n because of " << *Barrier << "\n");
+        return max_cost;
+      }
+    }
+  }
+
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    int Cost = 0;
+    ValueList Operands;
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    // Prepare the operand vector.
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      // Check that the source type is the same for all of the instructions.
+      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
+        return getScalarizationCost(VecTy);
+    }
+
+    Cost += getTreeCost_rec(Operands, Depth + 1);
+    if (Cost >= max_cost) return max_cost;
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+                                                       VL0->getType(), SrcTy);
+
+    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    int Cost = 0;
+    // Calculate the cost of all of the operands.
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      Cost += getTreeCost_rec(Operands, Depth + 1);
+      if (Cost >= max_cost) return max_cost;
+    }
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VecTy->getNumElements() *
+      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+
+    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Load: {
+    // If the loads are not consecutive then we have to scalarize them and
+    // pay the cost of gathering the values into a vector.
+    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i+1]))
+        return getScalarizationCost(VecTy);
+
+    // Cost of wide load - cost of scalar loads.
+    int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+    return VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
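+    // A worked example with made-up costs: for four i32 stores, if the
+    // target reports a cost of 1 for a scalar store and 1 for a <4 x i32>
+    // store, then StoreCost = 1 - 4 * 1 = -3, and a sufficiently negative
+    // total for the whole tree (below the caller's threshold) makes the
+    // chain profitable to vectorize.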
+    int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+    int StoreCost = VecStCost - ScalarStCost;
+
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      MemBarrierIgnoreList.insert(VL[j]);
+    }
+
+    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
+    return TotalCost;
+  }
+  default:
+    // Unable to vectorize unknown instructions.
+    return getScalarizationCost(VecTy);
+  }
+}
+
+Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
+  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
+  for (unsigned i = 0; i < VF; ++i)
+    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+  return InstrVec[MaxIdx + 1];
+}
+
+Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
+  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
+  Value *Vec = UndefValue::get(Ty);
+  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+    // Generate the 'InsertElement' instruction.
+    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+    // Remember that this instruction is used as part of a 'gather' sequence.
+    // The caller of the bottom-up SLP vectorizer can try to hoist the sequence
+    // if the users are outside of the basic block.
+    GatherInstructions.push_back(Vec);
+  }
+
+  return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
+  Value *V = vectorizeTree_rec(VL, VF);
+  // We moved some instructions around. We have to number them again
+  // before we can do any further analysis.
+  numberInstructions();
+  MustScalarize.clear();
+  return V;
+}
+
+Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VF);
+
+  // Check if all of the operands are constants or identical.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // The instruction must be in the same BB, and it must be vectorizable.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
+      return Scalarize(VL, VecTy);
+  }
+
+  // Gather constants and operands that are all the same scalar.
+  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return Scalarize(VL, VecTy);
+
+  if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
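+    // Scalarize builds the vector with insertelement instructions. For a
+    // 2-wide gather of %s0 and %s1 it emits (hypothetical IR):
+    //   %v0 = insertelement <2 x i32> undef, i32 %s0, i32 0
+    //   %v1 = insertelement <2 x i32> %v0, i32 %s1, i32 1
+    // and records both instructions in GatherInstructions so that the caller
+    // can try to hoist the sequence out of the block.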
+    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
+  }
+
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    ValueList INVL;
+    for (int i = 0; i < VF; ++i)
+      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+    Value *InVec = vectorizeTree_rec(INVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    CastInst *CI = cast<CastInst>(VL0);
+    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0; i < VF; ++i) {
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Value *LHS = vectorizeTree_rec(LHSVL, VF);
+    Value *RHS = vectorizeTree_rec(RHSVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(VL0);
+    unsigned Alignment = LI->getAlignment();
+
+    // Check if all of the loads are consecutive.
+    for (unsigned i = 1, e = VF; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i-1], VL[i]))
+        return Scalarize(VL, VecTy);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    VectorizedValues[VL0] = LI;
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0; i < VF; ++i)
+      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+    for (int i = 0; i < VF; ++i)
+      cast<Instruction>(VL[i])->eraseFromParent();
+    return 0;
+  }
+  default:
+    Value *S = Scalarize(VL, VecTy);
+    VectorizedValues[VL0] = S;
+    return S;
+  }
+}
+
+} // end of namespace
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
new file mode 100644
index 0000000..5456c6c
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -0,0 +1,164 @@
+//===- VecUtils.h - Vectorization Utilities ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of classes and functions manipulate vectors and chains of
+// vectors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include <vector>
+
+namespace llvm {
+
+class BasicBlock; class Instruction; class Type;
+class VectorType; class StoreInst; class Value;
+class ScalarEvolution; class DataLayout;
+class TargetTransformInfo; class AliasAnalysis;
+class Loop;
+
+/// Bottom Up SLP vectorization utility class.
+struct BoUpSLP {
+  typedef SmallVector<Value*, 8> ValueList;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  typedef SmallVector<StoreInst*, 8> StoreList;
+  static const int max_cost = 1<<20;
+
+  /// \brief C'tor.
+  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
+          TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
+
+  /// \brief Take the pointer operand from the Load/Store instruction.
+  /// \returns NULL if this is not a valid Load/Store instruction.
+  static Value *getPointerOperand(Value *I);
+
+  /// \brief Take the address space operand from the Load/Store instruction.
+  /// \returns -1 if this is not a valid Load/Store instruction.
+  static unsigned getAddressSpaceOperand(Value *I);
+
+  /// \returns true if the memory operations A and B are consecutive.
+  bool isConsecutiveAccess(Value *A, Value *B);
+
+  /// \brief Vectorize the tree that starts with the elements in \p VL.
+  /// \returns the vectorized value.
+  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
+
+  /// \returns the vectorization cost of the subtree that starts at \p VL.
+  /// A negative number means that this is profitable.
+  int getTreeCost(ArrayRef<Value *> VL);
+
+  /// \returns the scalarization cost for this list of values. Assuming that
+  /// this subtree gets vectorized, we may need to extract the values from the
+  /// roots. This method calculates the cost of extracting the values.
+  int getScalarizationCost(ArrayRef<Value *> VL);
+
+  /// \brief Attempts to order and vectorize a sequence of stores. This
+  /// function does a quadratic scan of the given stores.
+  /// \returns true if the basic block was modified.
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
+
+  /// \brief Vectorize a group of scalars into a vector tree.
+  void vectorizeArith(ArrayRef<Value *> Operands);
+
+  /// \returns the list of new instructions that were added in order to collect
+  /// scalars into vectors. This list can be used to further optimize the
+  /// gather sequences.
+  ValueList &getGatherSeqInstructions() { return GatherInstructions; }
+
+private:
+  /// \brief This method contains the recursive part of getTreeCost.
+  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This recursive method looks for vectorization hazards such as
+  /// values that are used by multiple users and checks that values are used
+  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
+  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This method contains the recursive part of vectorizeTree.
+  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
+
+  /// \brief Number all of the instructions in the block.
+  void numberInstructions();
+
+  /// \brief Vectorize a sorted sequence of stores.
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+
+  /// \returns the scalarization cost for this type. Scalarization in this
+  /// context means the creation of vectors from a group of scalars.
+  int getScalarizationCost(Type *Ty);
+
+  /// \returns the AA location that is being accessed by the instruction.
+  AliasAnalysis::Location getLocation(Instruction *I);
+
+  /// \brief Checks if it is possible to sink an instruction from
+  /// \p Src to \p Dst.
+  /// \returns the pointer to the barrier instruction if we can't sink.
+  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
+
+  /// \returns the instruction that appears last in the BB from \p VL.
+  /// Only consider the first \p VF elements.
+  Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
+
+  /// \returns a vector from a collection of scalars in \p VL.
+  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
+
+private:
+  /// Maps instructions to their index in the basic block.
+  SmallDenseMap<Value*, int> InstrIdx;
+  /// Maps indices back to instructions.
+  std::vector<Instruction*> InstrVec;
+
+  // -- Containers that are used during getTreeCost -- //
+
+  /// Contains values that must be scalarized because they are used
+  /// by multiple lanes, or by users outside the tree.
+  /// NOTICE: The vectorization methods also use this set.
+  ValueSet MustScalarize;
+
+  /// Contains values that have multiple users; getTreeCost checks whether all
+  /// of their users are within the tree and in the same lane. This set must
+  /// be reset between runs.
+  ValueSet MultiUserVals;
+  /// Maps values in the tree to the vector lanes that use them. This map must
+  /// be reset between runs of getTreeCost.
+  std::map<Value*, int> LaneMap;
+  /// A list of store instructions to ignore while sinking memory
+  /// instructions. This set must be reset between runs of getTreeCost.
+  SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
+
+  // -- Containers that are used during vectorizeTree -- //
+
+  /// Maps the first scalar of each vectorized bundle to the generated vector
+  /// value. This map must be reset between runs.
+  DenseMap<Value*, Value*> VectorizedValues;
+
+  // -- Containers that are used after vectorization by the caller -- //
+
+  /// A list of instructions that are used when gathering scalars into vectors.
+  /// In many cases these instructions can be hoisted outside of the BB.
+  /// Iterating over this list is faster than calling LICM.
+  ValueList GatherInstructions;
+
+  // Analysis and block reference.
+ BasicBlock *BB; + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + Loop *L; +}; + +} // end of namespace + +#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 19eefd2..a927fe1 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -1,4 +1,4 @@ - //===-- Vectorize.cpp -----------------------------------------------------===// +//===-- Vectorize.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -28,6 +28,7 @@ using namespace llvm; void llvm::initializeVectorization(PassRegistry &Registry) { initializeBBVectorizePass(Registry); initializeLoopVectorizePass(Registry); + initializeSLPVectorizerPass(Registry); } void LLVMInitializeVectorization(LLVMPassRegistryRef R) { @@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopVectorizePass()); } + +void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createSLPVectorizerPass()); +} |