diff options
Diffstat (limited to 'contrib/llvm/lib/Transforms/Vectorize')
4 files changed, 1706 insertions, 2318 deletions
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index a0ccf9d..fd7661f 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -201,14 +202,14 @@ namespace { initializeBBVectorizePass(*PassRegistry::getPassRegistry()); } - BBVectorize(Pass *P, const VectorizeConfig &C) + BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { AA = &P->getAnalysis<AliasAnalysis>(); DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &P->getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>(); + TTI = IgnoreTargetInfo + ? nullptr + : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); } typedef std::pair<Value *, Value *> ValuePair; @@ -220,7 +221,6 @@ namespace { AliasAnalysis *AA; DominatorTree *DT; ScalarEvolution *SE; - const DataLayout *DL; const TargetTransformInfo *TTI; // FIXME: const correct? @@ -440,9 +440,10 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>(); + TTI = IgnoreTargetInfo + ? nullptr + : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *BB.getParent()); return vectorizeBB(BB); } @@ -452,7 +453,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<ScalarEvolution>(); @@ -637,19 +638,19 @@ namespace { dyn_cast<SCEVConstant>(OffsetSCEV)) { ConstantInt *IntOff = ConstOffSCEV->getValue(); int64_t Offset = IntOff->getSExtValue(); - + const DataLayout &DL = I->getModule()->getDataLayout(); Type *VTy = IPtr->getType()->getPointerElementType(); - int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy); + int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); Type *VTy2 = JPtr->getType()->getPointerElementType(); if (VTy != VTy2 && Offset < 0) { - int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2); + int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); OffsetInElmts = Offset/VTy2TSS; - return (abs64(Offset) % VTy2TSS) == 0; + return (std::abs(Offset) % VTy2TSS) == 0; } OffsetInElmts = Offset/VTyTSS; - return (abs64(Offset) % VTyTSS) == 0; + return (std::abs(Offset) % VTyTSS) == 0; } return false; @@ -661,7 +662,7 @@ namespace { Function *F = I->getCalledFunction(); if (!F) return false; - Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); + Intrinsic::ID IID = F->getIntrinsicID(); if (!IID) return false; switch(IID) { @@ -841,7 +842,7 @@ namespace { // It is important to cleanup here so that future iterations of this // function have less work to do. - (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo()); + (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo()); return true; } @@ -895,10 +896,6 @@ namespace { return false; } - // We can't vectorize memory operations without target data - if (!DL && IsSimpleLoadStore) - return false; - Type *T1, *T2; getInstructionTypes(I, T1, T2); @@ -933,9 +930,8 @@ namespace { if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) return false; - if ((!Config.VectorizePointers || !DL) && - (T1->getScalarType()->isPointerTy() || - T2->getScalarType()->isPointerTy())) + if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() || + T2->getScalarType()->isPointerTy())) return false; if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || @@ -980,8 +976,8 @@ namespace { unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, - OffsetInElmts) && abs64(OffsetInElmts) == 1) { + IAddressSpace, JAddressSpace, OffsetInElmts) && + std::abs(OffsetInElmts) == 1) { FixedOrder = (int) OffsetInElmts; unsigned BottomAlignment = IAlignment; if (OffsetInElmts < 0) BottomAlignment = JAlignment; @@ -996,8 +992,8 @@ namespace { // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. - - unsigned VecAlignment = DL->getPrefTypeAlignment(VType); + const DataLayout &DL = I->getModule()->getDataLayout(); + unsigned VecAlignment = DL.getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; } @@ -1102,7 +1098,7 @@ namespace { CallInst *CI = dyn_cast<CallInst>(I); Function *FI; if (CI && (FI = CI->getCalledFunction())) { - Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); + Intrinsic::ID IID = FI->getIntrinsicID(); if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || IID == Intrinsic::cttz) { Value *A1I = CI->getArgOperand(1), @@ -1277,7 +1273,7 @@ namespace { CostSavings, FixedOrder)) continue; // J is a candidate for merging with I. - if (!PairableInsts.size() || + if (PairableInsts.empty() || PairableInsts[PairableInsts.size()-1] != I) { PairableInsts.push_back(I); } @@ -2774,7 +2770,7 @@ namespace { continue; } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); - Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); + Intrinsic::ID IID = F->getIntrinsicID(); if (o == NumOperands-1) { BasicBlock &BB = *I->getParent(); @@ -3107,7 +3103,17 @@ namespace { else if (H->hasName()) K->takeName(H); - if (!isa<StoreInst>(K)) + if (auto CS = CallSite(K)) { + SmallVector<Type *, 3> Tys; + FunctionType *Old = CS.getFunctionType(); + unsigned NumOld = Old->getNumParams(); + assert(NumOld <= ReplacedOperands.size()); + for (unsigned i = 0; i != NumOld; ++i) + Tys.push_back(ReplacedOperands[i]->getType()); + CS.mutateFunctionType( + FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), + Tys, Old->isVarArg())); + } else if (!isa<StoreInst>(K)) K->mutateType(getVecTypeForPair(L->getType(), H->getType())); unsigned KnownIDs[] = { @@ -3186,13 +3192,13 @@ namespace { DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); } -} +} // namespace char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) @@ -3203,7 +3209,7 @@ BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { bool llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { - BBVectorize BBVectorizer(P, C); + BBVectorize BBVectorizer(P, *BB.getParent(), C); return BBVectorizer.vectorizeBB(BB); } diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 47b92a3..b7faa20 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -58,6 +62,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -92,6 +97,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/VectorUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <map> #include <tuple> @@ -105,15 +111,6 @@ using namespace llvm::PatternMatch; STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -static cl::opt<unsigned> -VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, - cl::desc("Sets the SIMD width. Zero is autoselect.")); - -static cl::opt<unsigned> -VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, - cl::desc("Sets the vectorization interleave count. " - "Zero is autoselect.")); - static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); @@ -141,15 +138,18 @@ static cl::opt<bool> EnableMemAccessVersioning( "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); -/// We don't unroll loops with a known constant trip count below this number. -static const unsigned TinyTripCountUnrollThreshold = 128; +static cl::opt<bool> EnableInterleavedMemAccesses( + "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on interleaved memory accesses in a loop")); -/// When performing memory disambiguation checks at runtime do not make more -/// than this number of comparisons. -static const unsigned RuntimeMemoryCheckThreshold = 8; +/// Maximum factor for an interleaved memory access. +static cl::opt<unsigned> MaxInterleaveGroupFactor( + "max-interleave-group-factor", cl::Hidden, + cl::desc("Maximum factor for an interleaved access group (default = 8)"), + cl::init(8)); -/// Maximum simd width. -static const unsigned MaxVectorWidth = 64; +/// We don't unroll loops with a known constant trip count below this number. +static const unsigned TinyTripCountUnrollThreshold = 128; static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -218,29 +218,30 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizeHints; -/// Optimization analysis message produced during vectorization. Messages inform -/// the user why vectorization did not occur. -class Report { - std::string Message; - raw_string_ostream Out; - Instruction *Instr; - +/// \brief This modifies LoopAccessReport to initialize message with +/// loop-vectorizer-specific part. +class VectorizationReport : public LoopAccessReport { public: - Report(Instruction *I = nullptr) : Out(Message), Instr(I) { - Out << "loop not vectorized: "; - } - - template <typename A> Report &operator<<(const A &Value) { - Out << Value; - return *this; - } - - Instruction *getInstr() { return Instr; } - - std::string &str() { return Out.str(); } - operator Twine() { return Out.str(); } + VectorizationReport(Instruction *I = nullptr) + : LoopAccessReport("loop not vectorized: ", I) {} + + /// \brief This allows promotion of the loop-access analysis report into the + /// loop-vectorizer report. It modifies the message to add the + /// loop-vectorizer-specific part of the message. + explicit VectorizationReport(const LoopAccessReport &R) + : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), + R.getInstr()) {} }; +/// A helper function for converting Scalar types to vector types. +/// If the incoming type is void, we return void. If the VF is 1, we return +/// the scalar type. +static Type* ToVectorTy(Type *Scalar, unsigned VF) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, VF); +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -258,13 +259,13 @@ public: class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const DataLayout *DL, - const TargetLibraryInfo *TLI, unsigned VecWidth, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, unsigned VecWidth, unsigned UnrollFactor) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr) {} + Legal(nullptr), AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *L) { @@ -278,6 +279,11 @@ public: updateAnalysis(); } + // Return true if any runtime check is added. + bool IsSafetyChecksAdded() { + return AddedSafetyChecks; + } + virtual ~InnerLoopVectorizer() {} protected: @@ -288,19 +294,12 @@ protected: /// originated from one scalar instruction. typedef SmallVector<Value*, 2> VectorParts; - // When we if-convert we need create edge masks. We have to cache values so - // that we don't end up with exponential recursion/IR. + // When we if-convert we need to create edge masks. We have to cache values + // so that we don't end up with exponential recursion/IR. typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts> EdgeMaskCache; - /// \brief Add code that checks at runtime if the accessed arrays overlap. - /// - /// Returns a pair of instructions where the first element is the first - /// instruction generated in possibly a sequence of instructions and the - /// second value is the final comparator value or NULL if no check is needed. - std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc); - - /// \brief Add checks for strides that where assumed to be 1. + /// \brief Add checks for strides that were assumed to be 1. /// /// Returns the last check instruction and the first check instruction in the /// pair as (first, last). @@ -355,10 +354,9 @@ protected: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// This function adds 0, 1, 2 ... to each vector element, starting at zero. - /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). - /// The sequence starts at StartIndex. - virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) + /// to each vector element of Val. The sequence starts at StartIndex. + virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -367,6 +365,9 @@ protected: /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -420,10 +421,10 @@ protected: DominatorTree *DT; /// Alias Analysis. AliasAnalysis *AA; - /// Data Layout. - const DataLayout *DL; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Target Transform Info. + const TargetTransformInfo *TTI; /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. @@ -465,21 +466,24 @@ protected: EdgeMaskCache MaskCache; LoopVectorizationLegality *Legal; + + // Record whether runtime check is added. + bool AddedSafetyChecks; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const DataLayout *DL, - const TargetLibraryInfo *TLI, unsigned UnrollFactor) : - InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {} private: void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; Value *reverseVector(Value *Vec) override; }; @@ -517,9 +521,8 @@ static std::string getDebugLocString(const Loop *L) { std::string Result; if (L) { raw_string_ostream OS(Result); - const DebugLoc LoopDbgLoc = L->getStartLoc(); - if (!LoopDbgLoc.isUnknown()) - LoopDbgLoc.print(L->getHeader()->getContext(), OS); + if (const DebugLoc LoopDbgLoc = L->getStartLoc()) + LoopDbgLoc.print(OS); else // Just print the module name. OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); @@ -559,6 +562,219 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F propagateMetadata(I, From); } +/// \brief The group of interleaved loads/stores sharing the same stride and +/// close to each other. +/// +/// Each member in this group has an index starting from 0, and the largest +/// index should be less than interleaved factor, which is equal to the absolute +/// value of the access's stride. +/// +/// E.g. An interleaved load group of factor 4: +/// for (unsigned i = 0; i < 1024; i+=4) { +/// a = A[i]; // Member of index 0 +/// b = A[i+1]; // Member of index 1 +/// d = A[i+3]; // Member of index 3 +/// ... +/// } +/// +/// An interleaved store group of factor 4: +/// for (unsigned i = 0; i < 1024; i+=4) { +/// ... +/// A[i] = a; // Member of index 0 +/// A[i+1] = b; // Member of index 1 +/// A[i+2] = c; // Member of index 2 +/// A[i+3] = d; // Member of index 3 +/// } +/// +/// Note: the interleaved load group could have gaps (missing members), but +/// the interleaved store group doesn't allow gaps. +class InterleaveGroup { +public: + InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) + : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) { + assert(Align && "The alignment should be non-zero"); + + Factor = std::abs(Stride); + assert(Factor > 1 && "Invalid interleave factor"); + + Reverse = Stride < 0; + Members[0] = Instr; + } + + bool isReverse() const { return Reverse; } + unsigned getFactor() const { return Factor; } + unsigned getAlignment() const { return Align; } + unsigned getNumMembers() const { return Members.size(); } + + /// \brief Try to insert a new member \p Instr with index \p Index and + /// alignment \p NewAlign. The index is related to the leader and it could be + /// negative if it is the new leader. + /// + /// \returns false if the instruction doesn't belong to the group. + bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { + assert(NewAlign && "The new member's alignment should be non-zero"); + + int Key = Index + SmallestKey; + + // Skip if there is already a member with the same index. + if (Members.count(Key)) + return false; + + if (Key > LargestKey) { + // The largest index is always less than the interleave factor. + if (Index >= static_cast<int>(Factor)) + return false; + + LargestKey = Key; + } else if (Key < SmallestKey) { + // The largest index is always less than the interleave factor. + if (LargestKey - Key >= static_cast<int>(Factor)) + return false; + + SmallestKey = Key; + } + + // It's always safe to select the minimum alignment. + Align = std::min(Align, NewAlign); + Members[Key] = Instr; + return true; + } + + /// \brief Get the member with the given index \p Index + /// + /// \returns nullptr if contains no such member. + Instruction *getMember(unsigned Index) const { + int Key = SmallestKey + Index; + if (!Members.count(Key)) + return nullptr; + + return Members.find(Key)->second; + } + + /// \brief Get the index for the given member. Unlike the key in the member + /// map, the index starts from 0. + unsigned getIndex(Instruction *Instr) const { + for (auto I : Members) + if (I.second == Instr) + return I.first - SmallestKey; + + llvm_unreachable("InterleaveGroup contains no such member"); + } + + Instruction *getInsertPos() const { return InsertPos; } + void setInsertPos(Instruction *Inst) { InsertPos = Inst; } + +private: + unsigned Factor; // Interleave Factor. + bool Reverse; + unsigned Align; + DenseMap<int, Instruction *> Members; + int SmallestKey; + int LargestKey; + + // To avoid breaking dependences, vectorized instructions of an interleave + // group should be inserted at either the first load or the last store in + // program order. + // + // E.g. %even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 + // + // store i32 %even + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; +}; + +/// \brief Drive the analysis of interleaved memory accesses in the loop. +/// +/// Use this class to analyze interleaved accesses only when we can vectorize +/// a loop. Otherwise it's meaningless to do analysis as the vectorization +/// on interleaved accesses is unsafe. +/// +/// The analysis collects interleave groups and records the relationships +/// between the member and the group in a map. +class InterleavedAccessInfo { +public: + InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) + : SE(SE), TheLoop(L), DT(DT) {} + + ~InterleavedAccessInfo() { + SmallSet<InterleaveGroup *, 4> DelSet; + // Avoid releasing a pointer twice. + for (auto &I : InterleaveGroupMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + + /// \brief Analyze the interleaved accesses and collect them in interleave + /// groups. Substitute symbolic strides using \p Strides. + void analyzeInterleaving(const ValueToValueMap &Strides); + + /// \brief Check if \p Instr belongs to any interleave group. + bool isInterleaved(Instruction *Instr) const { + return InterleaveGroupMap.count(Instr); + } + + /// \brief Get the interleave group that \p Instr belongs to. + /// + /// \returns nullptr if doesn't have such group. + InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { + if (InterleaveGroupMap.count(Instr)) + return InterleaveGroupMap.find(Instr)->second; + return nullptr; + } + +private: + ScalarEvolution *SE; + Loop *TheLoop; + DominatorTree *DT; + + /// Holds the relationships between the members and the interleave group. + DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; + + /// \brief The descriptor for a strided memory access. + struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, + unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + + StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} + + int Stride; // The access's stride. It is negative for a reverse access. + const SCEV *Scev; // The scalar expression of this access + unsigned Size; // The size of the memory object. + unsigned Align; // The alignment of this access. + }; + + /// \brief Create a new interleave group with the given instruction \p Instr, + /// stride \p Stride and alignment \p Align. + /// + /// \returns the newly created interleave group. + InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, + unsigned Align) { + assert(!InterleaveGroupMap.count(Instr) && + "Already in an interleaved access group"); + InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); + return InterleaveGroupMap[Instr]; + } + + /// \brief Release the group and remove all the relationships. + void releaseGroup(InterleaveGroup *Group) { + for (unsigned i = 0; i < Group->getFactor(); i++) + if (Instruction *Member = Group->getMember(i)) + InterleaveGroupMap.erase(Member); + + delete Group; + } + + /// \brief Collect all the accesses with a constant stride in program order. + void collectConstStridedAccesses( + MapVector<Instruction *, StrideDescriptor> &StrideAccesses, + const ValueToValueMap &Strides); +}; + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -574,140 +790,89 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - unsigned NumLoads; - unsigned NumStores; - unsigned NumPredStores; - - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, - DominatorTree *DT, TargetLibraryInfo *TLI, - AliasAnalysis *AA, Function *F, - const TargetTransformInfo *TTI) - : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { - } - - /// This enum represents the kinds of reductions that we support. - enum ReductionKind { - RK_NoReduction, ///< Not a reduction. - RK_IntegerAdd, ///< Sum of integers. - RK_IntegerMult, ///< Product of integers. - RK_IntegerOr, ///< Bitwise or logical OR of numbers. - RK_IntegerAnd, ///< Bitwise or logical AND of numbers. - RK_IntegerXor, ///< Bitwise or logical XOR of numbers. - RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). - RK_FloatAdd, ///< Sum of floats. - RK_FloatMult, ///< Product of floats. - RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). - }; + LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, + TargetLibraryInfo *TLI, AliasAnalysis *AA, + Function *F, const TargetTransformInfo *TTI, + LoopAccessAnalysis *LAA) + : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of inductions that we support. enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = 1. - IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). - IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). - }; - - // This enum represents the kind of minmax reduction. - enum MinMaxReductionKind { - MRK_Invalid, - MRK_UIntMin, - MRK_UIntMax, - MRK_SIntMin, - MRK_SIntMax, - MRK_FloatMin, - MRK_FloatMax - }; - - /// This struct holds information about reduction variables. - struct ReductionDescriptor { - ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), - Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} - - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, - MinMaxReductionKind MK) - : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} - - // The starting value of the reduction. - // It does not have to be zero! - TrackingVH<Value> StartValue; - // The instruction who's value is used outside the loop. - Instruction *LoopExitInstr; - // The kind of the reduction. - ReductionKind Kind; - // If this a min/max reduction the kind of reduction. - MinMaxReductionKind MinMaxKind; + IK_NoInduction, ///< Not an induction variable. + IK_IntInduction, ///< Integer induction variable. Step = C. + IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). }; - /// This POD struct holds information about a potential reduction operation. - struct ReductionInstDesc { - ReductionInstDesc(bool IsRedux, Instruction *I) : - IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} - - ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : - IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} - - // Is this instruction a reduction candidate. - bool IsReduction; - // The last instruction in a min/max pattern (select of the select(icmp()) - // pattern), or the current reduction instruction otherwise. - Instruction *PatternLastInst; - // If this is a min/max pattern the comparison predicate. - MinMaxReductionKind MinMaxKind; - }; - - /// This struct holds information about the memory runtime legality - /// check that a group of pointers do not overlap. - struct RuntimePointerCheck { - RuntimePointerCheck() : Need(false) {} - - /// Reset the state of the pointer runtime information. - void reset() { - Need = false; - Pointers.clear(); - Starts.clear(); - Ends.clear(); - IsWritePtr.clear(); - DependencySetId.clear(); - AliasSetId.clear(); + /// A struct for saving information about induction variables. + struct InductionInfo { + InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); + } + InductionInfo() + : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} + + /// Get the consecutive direction. Returns: + /// 0 - unknown or non-consecutive. + /// 1 - consecutive and increasing. + /// -1 - consecutive and decreasing. + int getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; } - /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, - unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); - - /// This flag indicates if we need to add the runtime check. - bool Need; - /// Holds the pointers that we need to check. - SmallVector<TrackingVH<Value>, 2> Pointers; - /// Holds the pointer value at the beginning of the loop. - SmallVector<const SCEV*, 2> Starts; - /// Holds the pointer value at the end of the loop. - SmallVector<const SCEV*, 2> Ends; - /// Holds the information if this pointer is used for writing to memory. - SmallVector<bool, 2> IsWritePtr; - /// Holds the id of the set of pointers that could be dependent because of a - /// shared underlying object. - SmallVector<unsigned, 2> DependencySetId; - /// Holds the id of the disjoint alias set to which this pointer belongs. - SmallVector<unsigned, 2> AliasSetId; - }; + /// Compute the transformed value of Index at offset StartValue using step + /// StepValue. + /// For integer induction, returns StartValue + Index * StepValue. + /// For pointer induction, returns StartValue[Index * StepValue]. + /// FIXME: The newly created binary instructions should contain nsw/nuw + /// flags, which can be found from the original scalar operations. + Value *transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(nullptr, StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); + } - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; + /// Step value. + ConstantInt *StepValue; }; /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. - typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; + typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList; /// InductionList saves induction variables and maps them to the /// induction descriptor. @@ -754,13 +919,25 @@ public: bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. - RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const { + return LAI->getRuntimePointerCheck(); + } + + const LoopAccessInfo *getLAI() const { + return LAI; + } - /// This function returns the identity element (or neutral element) for - /// the operation K. - static Constant *getReductionIdentity(ReductionKind K, Type *Tp); + /// \brief Check if \p Instr belongs to any interleaved access group. + bool isAccessInterleaved(Instruction *Instr) { + return InterleaveInfo.isInterleaved(Instr); + } + + /// \brief Get the interleaved access group that \p Instr belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return InterleaveInfo.getInterleaveGroup(Instr); + } - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } bool hasStride(Value *V) { return StrideSet.count(V); } bool mustCheckStrides() { return !StrideSet.empty(); } @@ -784,6 +961,15 @@ public: bool isMaskRequired(const Instruction* I) { return (MaskedOp.count(I) != 0); } + unsigned getNumStores() const { + return LAI->getNumStores(); + } + unsigned getNumLoads() const { + return LAI->getNumLoads(); + } + unsigned getNumPredStores() const { + return NumPredStores; + } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -808,23 +994,9 @@ private: /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); - /// Returns True, if 'Phi' is the kind of reduction variable for type - /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. - bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns a struct describing if the instruction 'I' can be a reduction - /// variable of type 'Kind'. If the reduction is a min/max pattern of - /// select(icmp()) this function advances the instruction pointer 'I' from the - /// compare instruction to the select instruction and stores this pointer in - /// 'PatternLastInst' member of the returned struct. - ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, - ReductionInstDesc &Desc); - /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction - /// pattern corresponding to a min(X, Y) or max(X, Y). - static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, - ReductionInstDesc &Prev); - /// Returns the induction kind of Phi. This function may return NoInduction - /// if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi); + /// Returns the induction kind of Phi and record the step. This function may + /// return NoInduction if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); /// \brief Collect memory access with loop invariant strides. /// @@ -833,31 +1005,36 @@ private: void collectStridedAccess(Value *LoadOrStoreInst); /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. - void emitAnalysis(Report &Message) { - DebugLoc DL = TheLoop->getStartLoc(); - if (Instruction *I = Message.getInstr()) - DL = I->getDebugLoc(); - emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, - *TheFunction, DL, Message.str()); + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. + void emitAnalysis(const LoopAccessReport &Message) { + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); } + unsigned NumPredStores; + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. ScalarEvolution *SE; - /// DataLayout analysis. - const DataLayout *DL; - /// Dominators. - DominatorTree *DT; /// Target Library Info. TargetLibraryInfo *TLI; - /// Alias analysis. - AliasAnalysis *AA; /// Parent function Function *TheFunction; /// Target Transform Info const TargetTransformInfo *TTI; + /// Dominator Tree. + DominatorTree *DT; + // LoopAccess analysis. + LoopAccessAnalysis *LAA; + // And the loop-accesses info corresponding to this loop. This pointer is + // null until canVectorizeMemory sets it up. + const LoopAccessInfo *LAI; + + /// The interleave access information contains groups of interleaved accesses + /// with the same stride and close to each other. + InterleavedAccessInfo InterleaveInfo; // --- vectorization state --- // @@ -879,17 +1056,13 @@ private: /// This set holds the variables which are known to be uniform after /// vectorization. SmallPtrSet<Instruction*, 4> Uniforms; - /// We need to check that all of the pointers in this list are disjoint - /// at runtime. - RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; - unsigned MaxSafeDepDistBytes; - ValueToValueMap Strides; SmallPtrSet<Value *, 8> StrideSet; - + /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet<const Instruction*, 8> MaskedOp; @@ -907,10 +1080,9 @@ public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const DataLayout *DL, const TargetLibraryInfo *TLI, - AssumptionCache *AC, const Function *F, - const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), + const TargetLibraryInfo *TLI, AssumptionCache *AC, + const Function *F, const LoopVectorizeHints *Hints) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), TheFunction(F), Hints(Hints) { CodeMetrics::collectEphemeralValues(L, AC, EphValues); } @@ -963,23 +1135,16 @@ private: /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); - /// A helper function for converting Scalar types to vector types. - /// If the incoming type is void, we return void. If the VF is 1, we return - /// the scalar type. - static Type* ToVectorTy(Type *Scalar, unsigned VF); - /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. - void emitAnalysis(Report &Message) { - DebugLoc DL = TheLoop->getStartLoc(); - if (Instruction *I = Message.getInstr()) - DL = I->getDebugLoc(); - emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, - *TheFunction, DL, Message.str()); + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. + void emitAnalysis(const LoopAccessReport &Message) { + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); } /// Values used only by @llvm.assume calls. @@ -995,8 +1160,6 @@ private: LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; - /// Target data layout information. - const DataLayout *DL; /// Target Library Info. const TargetLibraryInfo *TLI; const Function *TheFunction; @@ -1032,7 +1195,7 @@ class LoopVectorizeHints { bool validate(unsigned Val) { switch (Kind) { case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= MaxVectorWidth; + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; case HK_UNROLL: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: @@ -1060,7 +1223,8 @@ public: }; LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizationFactor, HK_WIDTH), + : Width("vectorize.width", VectorizerParams::VectorizationFactor, + HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), TheLoop(L) { @@ -1068,8 +1232,8 @@ public: getHintsFromMetadata(); // force-vector-interleave overrides DisableInterleaving. - if (VectorizationInterleave.getNumOccurrences() > 0) - Interleave.Value = VectorizationInterleave; + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); @@ -1084,7 +1248,7 @@ public: /// Dumps all the hint information. std::string emitRemark() const { - Report R; + VectorizationReport R; if (Force.Value == LoopVectorizeHints::FK_Disabled) R << "vectorization is explicitly disabled"; else { @@ -1260,7 +1424,6 @@ struct LoopVectorize : public FunctionPass { } ScalarEvolution *SE; - const DataLayout *DL; LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; @@ -1268,6 +1431,7 @@ struct LoopVectorize : public FunctionPass { TargetLibraryInfo *TLI; AliasAnalysis *AA; AssumptionCache *AC; + LoopAccessAnalysis *LAA; bool DisableUnrolling; bool AlwaysVectorize; @@ -1275,15 +1439,15 @@ struct LoopVectorize : public FunctionPass { bool runOnFunction(Function &F) override { SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - LI = &getAnalysis<LoopInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BFI = &getAnalysis<BlockFrequencyInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + LAA = &getAnalysis<LoopAccessAnalysis>(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1295,12 +1459,6 @@ struct LoopVectorize : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - if (!DL) { - DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName() - << ": Missing data layout\n"); - return false; - } - // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops // and can invalidate iterators across the loops. @@ -1320,6 +1478,40 @@ struct LoopVectorize : public FunctionPass { return Changed; } + static void AddRuntimeUnrollDisableMetaData(Loop *L) { + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + bool IsUnrollMetadata = false; + MDNode *LoopID = L->getLoopID(); + if (LoopID) { + // First find existing loop unrolling disable metadata. + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = + S && S->getString().startswith("llvm.loop.unroll.disable"); + } + MDs.push_back(LoopID->getOperand(i)); + } + } + + if (!IsUnrollMetadata) { + // Add runtime unroll disable metadata. + LLVMContext &Context = L->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back( + MDString::get(Context, "llvm.loop.unroll.runtime.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + L->setLoopID(NewLoopID); + } + } + bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -1394,7 +1586,7 @@ struct LoopVectorize : public FunctionPass { } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI); + LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); @@ -1402,8 +1594,7 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F, - &Hints); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); // Check the function attributes to find out if this function should be // optimized for size. @@ -1467,14 +1658,20 @@ struct LoopVectorize : public FunctionPass { // We decided not to vectorize, but we may want to unroll. - InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); + InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF); Unroller.vectorize(&LVL); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); + InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF); LB.vectorize(&LVL); ++LoopsVectorized; + // Add metadata to disable runtime unrolling scalar loop when there's no + // runtime check about strides and memory. Because at this situation, + // scalar loop is rarely used not worthy to be unrolled. + if (!LB.IsSafetyChecksAdded()) + AddRuntimeUnrollDisableMetaData(L); + // Report the vectorization decision. emitOptimizationRemark( F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), @@ -1495,11 +1692,12 @@ struct LoopVectorize : public FunctionPass { AU.addRequiredID(LCSSAID); AU.addRequired<BlockFrequencyInfo>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolution>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<AliasAnalysis>(); - AU.addPreserved<LoopInfo>(); + AU.addRequired<LoopAccessAnalysis>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AliasAnalysis>(); } @@ -1513,65 +1711,6 @@ struct LoopVectorize : public FunctionPass { // LoopVectorizationCostModel. //===----------------------------------------------------------------------===// -static Value *stripIntegerCast(Value *V) { - if (CastInst *CI = dyn_cast<CastInst>(V)) - if (CI->getOperand(0)->getType()->isIntegerTy()) - return CI->getOperand(0); - return V; -} - -///\brief Replaces the symbolic stride in a pointer SCEV expression by one. -/// -/// If \p OrigPtr is not null, use it to look up the stride value instead of -/// \p Ptr. -static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, - ValueToValueMap &PtrToStride, - Value *Ptr, Value *OrigPtr = nullptr) { - - const SCEV *OrigSCEV = SE->getSCEV(Ptr); - - // If there is an entry in the map return the SCEV of the pointer with the - // symbolic stride replaced by one. - ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); - if (SI != PtrToStride.end()) { - Value *StrideVal = SI->second; - - // Strip casts. - StrideVal = stripIntegerCast(StrideVal); - - // Replace symbolic stride by one. - Value *One = ConstantInt::get(StrideVal->getType(), 1); - ValueToValueMap RewriteMap; - RewriteMap[StrideVal] = One; - - const SCEV *ByOne = - SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); - DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne - << "\n"); - return ByOne; - } - - // Otherwise, just return the SCEV of the original pointer. - return SE->getSCEV(Ptr); -} - -void LoopVectorizationLegality::RuntimePointerCheck::insert( - ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, - unsigned ASId, ValueToValueMap &Strides) { - // Get the stride replaced scev. - const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getBackedgeTakenCount(Lp); - const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.push_back(Ptr); - Starts.push_back(AR->getStart()); - Ends.push_back(ScEnd); - IsWritePtr.push_back(WritePtr); - DependencySetId.push_back(DepSetId); - AliasSetId.push_back(ASId); -} - Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast<Instruction>(V); @@ -1591,11 +1730,13 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, + Value *Step) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); + assert(Step->getType() == Val->getType()->getScalarType() && + "Step has wrong type"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast<VectorType>(Val->getType()); @@ -1603,24 +1744,27 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, SmallVector<Constant*, 8> Indices; // Create a vector of consecutive numbers from zero to VF. - for (int i = 0; i < VLen; ++i) { - int64_t Idx = Negate ? (-i) : i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); - } + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder.CreateAdd(Val, Cv, "induction"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); } /// \brief Find the operand of the GEP that should be checked for consecutive /// stores. This ignores trailing indices that have no effect on the final /// pointer. -static unsigned getGEPInductionOperand(const DataLayout *DL, - const GetElementPtrInst *Gep) { +static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { + const DataLayout &DL = Gep->getModule()->getDataLayout(); unsigned LastOperand = Gep->getNumOperands() - 1; - unsigned GEPAllocSize = DL->getTypeAllocSize( + unsigned GEPAllocSize = DL.getTypeAllocSize( cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); // Walk backwards and try to peel off zeros. @@ -1631,7 +1775,7 @@ static unsigned getGEPInductionOperand(const DataLayout *DL, // If it's a type with the same allocation size as the result of the GEP we // can peel off the zero index. - if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) + if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize) break; --LastOperand; } @@ -1649,10 +1793,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); @@ -1677,13 +1818,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } - unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + unsigned InductionOperand = getGEPInductionOperand(Gep); // Check that all of the gep indices are uniform except for our induction // operand. @@ -1730,7 +1868,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { } bool LoopVectorizationLegality::isUniform(Value *V) { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return LAI->isUniform(V); } InnerLoopVectorizer::VectorParts& @@ -1763,6 +1901,251 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } +// Get a mask to interleave \p NumVec vectors into a wide vector. +// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector<Constant *, 16> Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector<Constant *, 16> Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consists of sequential integers +// starting from 0, The second part consists of UNDEFs. +// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, + unsigned NumUndef) { + SmallVector<Constant *, 16> Mask; + for (unsigned i = 0; i < NumInt; i++) + Mask.push_back(Builder.getInt32(i)); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 2nd vector should +// not have more elements than the 1st vector. If the 2nd vector has less +// elements, extend it with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); + VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend with UNDEFs. + Constant *ExtMask = + getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. +static Value *ConcatenateVectors(IRBuilder<> &Builder, + ArrayRef<Value *> InputList) { + unsigned NumVec = InputList.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector<Value *, 8> ResList; + ResList.append(InputList.begin(), InputList.end()); + do { + SmallVector<Value *, 8> TmpList; + for (unsigned i = 0; i < NumVec - 1; i += 2) { + Value *V0 = ResList[i], *V1 = ResList[i + 1]; + assert((V0->getType() == V1->getType() || i == NumVec - 2) && + "Only the last vector may have a different type"); + + TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); + } + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(ResList[NumVec - 1]); + + ResList = TmpList; + NumVec = ResList.size(); + } while (NumVec > 1); + + return ResList[0]; +} + +// Try to vectorize the interleave group that \p Instr belongs to. +// +// E.g. Translate following interleaved load group (factor = 3): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // Member of index 0 +// G = Pic[i+1]; // Member of index 1 +// B = Pic[i+2]; // Member of index 2 +// ... // do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following interleaved store group (factor = 3): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // Member of index 0 +// Pic[i+1] = G; // Member of index 1 +// Pic[i+2] = B; // Member of index 2 +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { + const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + assert(Group && "Fail to get an interleaved access group."); + + // Skip if current instruction is not the insert position. + if (Instr != Group->getInsertPos()) + return; + + LoadInst *LI = dyn_cast<LoadInst>(Instr); + StoreInst *SI = dyn_cast<StoreInst>(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned InterleaveFactor = Group->getFactor(); + Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector<Value *, 2> NewPtrs; + unsigned Index = Group->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Extract the pointer for current instruction from the pointer vector. A + // reverse access uses the pointer in the last lane. + Value *NewPtr = Builder.CreateExtractElement( + PtrParts[Part], + Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + + // Cast to the vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load group. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *NewLoadInstr = Builder.CreateAlignedLoad( + NewPtrs[Part], Group->getAlignment(), "wide.vec"); + + for (unsigned i = 0; i < InterleaveFactor; i++) { + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) + continue; + + Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); + Value *StridedVec = Builder.CreateShuffleVector( + NewLoadInstr, UndefVec, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(Member); + Entry[Part] = + Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + } + + propagateMetadata(NewLoadInstr, Instr); + } + return; + } + + // The sub vector type for current instruction. + VectorType *SubVT = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store group. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector from each member. + SmallVector<Value *, 4> StoredVecs; + for (unsigned i = 0; i < InterleaveFactor; i++) { + // Interleaved store group doesn't allow a gap, so each index has a member + Instruction *Member = Group->getMember(i); + assert(Member && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = + getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part]; + if (Group->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to an unified type. + if (StoredVec->getType() != SubVT) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + + StoredVecs.push_back(StoredVec); + } + + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the wide vector. + Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + Instruction *NewStoreInstr = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + propagateMetadata(NewStoreInstr, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); @@ -1770,17 +2153,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleave group if this access is interleaved. + if (Legal->isAccessInterleaved(Instr)) + return vectorizeInterleaveGroup(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. + const DataLayout &DL = Instr->getModule()->getDataLayout(); if (!Alignment) - Alignment = DL->getABITypeAlignment(ScalarDataTy); + Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); - unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy); + unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF; if (SI && Legal->blockNeedsPredication(SI->getParent()) && !Legal->isMaskRequired(SI)) @@ -1821,7 +2209,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(DL, Gep); + unsigned InductionOperand = getGEPInductionOperand(Gep); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); @@ -1864,7 +2252,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + Value *PartPtr = + Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { // If we store to reverse consecutive memory locations then we need @@ -1872,8 +2261,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); Mask[Part] = reverseVector(Mask[Part]); } @@ -1896,13 +2285,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + Value *PartPtr = + Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { // If the address is consecutive but reversed, then the // wide load needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); Mask[Part] = reverseVector(Mask[Part]); } @@ -1992,7 +2382,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } @@ -2021,7 +2411,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -2078,102 +2468,6 @@ InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { return std::make_pair(FirstInst, TheCheck); } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { - LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); - - Instruction *tnullptr = nullptr; - if (!PtrRtCheck->Need) - return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - - unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector<TrackingVH<Value> , 2> Starts; - SmallVector<TrackingVH<Value> , 2> Ends; - - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = nullptr; - - for (unsigned i = 0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck->Pointers[i]; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << - *Ptr <<"\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); - Starts.push_back(Start); - Ends.push_back(End); - } - } - - IRBuilder<> ChkBuilder(Loc); - // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - // No need to check if two readonly pointers intersect. - if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) - continue; - - // Only need to check pointers between two different dependency sets. - if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. - if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; - } - } - - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(Check, "memcheck.conflict"); - FirstInst = getFirstInst(FirstInst, Check, Loc); - return std::make_pair(FirstInst, Check); -} - void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain @@ -2238,9 +2532,11 @@ void InnerLoopVectorizer::createEmptyLoop() { ExitCount = SE->getAddExpr(BackedgeTakeCount, SE->getConstant(BackedgeTakeCount->getType(), 1)); + const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); + // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, "induction"); + SCEVExpander Exp(*SE, DL, "induction"); // We need to test whether the backedge-taken count is uint##_max. Adding one // to it will cause overflow and an incorrect loop trip count in the vector @@ -2299,13 +2595,13 @@ void InnerLoopVectorizer::createEmptyLoop() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); + ParentLoop->addBasicBlockToLoop(VectorPH, *LI); + ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + Lp->addBasicBlockToLoop(VecBody, *LI); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. @@ -2360,7 +2656,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); @@ -2376,11 +2672,12 @@ void InnerLoopVectorizer::createEmptyLoop() { std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(LastBypassBlock->getTerminator()); if (StrideCheck) { + AddedSafetyChecks = true; // Create a new block containing the stride check. BasicBlock *CheckBlock = LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2398,13 +2695,14 @@ void InnerLoopVectorizer::createEmptyLoop() { // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeCheck(LastBypassBlock->getTerminator()); + Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator()); if (MemRuntimeCheck) { + AddedSafetyChecks = true; // Create a new block containing the memory check. BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); + LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2495,33 +2793,13 @@ void InnerLoopVectorizer::createEmptyLoop() { Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, II.StartValue->getType(), "cast.crd"); - EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReverseIntInduction: { - // Convert the CountRoundDown variable to the PHI size. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - // Handle reverse integer induction counter. - EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); + EndValue = II.transform(BypassBuilder, CRD); + EndValue->setName("ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { - // For pointer induction variables, calculate the offset using - // the end index. - EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, - "ptr.ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReversePtrInduction: { - // The value at the end of the loop for the reverse pointer is calculated - // by creating a GEP with a negative index starting from the start value. - Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, - "rev.ind.end"); - EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, - "rev.ptr.ind.end"); + EndValue = II.transform(BypassBuilder, CountRoundDown); + EndValue->setName("ptr.ind.end"); break; } }// end of case @@ -2604,99 +2882,6 @@ void InnerLoopVectorizer::createEmptyLoop() { Hints.setAlreadyVectorized(); } -/// This function returns the identity element (or neutral element) for -/// the operation K. -Constant* -LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { - switch (K) { - case RK_IntegerXor: - case RK_IntegerAdd: - case RK_IntegerOr: - // Adding, Xoring, Oring zero to a number does not change it. - return ConstantInt::get(Tp, 0); - case RK_IntegerMult: - // Multiplying a number by 1 does not change it. - return ConstantInt::get(Tp, 1); - case RK_IntegerAnd: - // AND-ing a number with an all-1 value does not change it. - return ConstantInt::get(Tp, -1, true); - case RK_FloatMult: - // Multiplying a number by 1 does not change it. - return ConstantFP::get(Tp, 1.0L); - case RK_FloatAdd: - // Adding zero to a number does not change it. - return ConstantFP::get(Tp, 0.0L); - default: - llvm_unreachable("Unknown reduction kind"); - } -} - -/// This function translates the reduction kind to an LLVM binary operator. -static unsigned -getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { - switch (Kind) { - case LoopVectorizationLegality::RK_IntegerAdd: - return Instruction::Add; - case LoopVectorizationLegality::RK_IntegerMult: - return Instruction::Mul; - case LoopVectorizationLegality::RK_IntegerOr: - return Instruction::Or; - case LoopVectorizationLegality::RK_IntegerAnd: - return Instruction::And; - case LoopVectorizationLegality::RK_IntegerXor: - return Instruction::Xor; - case LoopVectorizationLegality::RK_FloatMult: - return Instruction::FMul; - case LoopVectorizationLegality::RK_FloatAdd: - return Instruction::FAdd; - case LoopVectorizationLegality::RK_IntegerMinMax: - return Instruction::ICmp; - case LoopVectorizationLegality::RK_FloatMinMax: - return Instruction::FCmp; - default: - llvm_unreachable("Unknown reduction operation"); - } -} - -Value *createMinMaxOp(IRBuilder<> &Builder, - LoopVectorizationLegality::MinMaxReductionKind RK, - Value *Left, - Value *Right) { - CmpInst::Predicate P = CmpInst::ICMP_NE; - switch (RK) { - default: - llvm_unreachable("Unknown min/max reduction kind"); - case LoopVectorizationLegality::MRK_UIntMin: - P = CmpInst::ICMP_ULT; - break; - case LoopVectorizationLegality::MRK_UIntMax: - P = CmpInst::ICMP_UGT; - break; - case LoopVectorizationLegality::MRK_SIntMin: - P = CmpInst::ICMP_SLT; - break; - case LoopVectorizationLegality::MRK_SIntMax: - P = CmpInst::ICMP_SGT; - break; - case LoopVectorizationLegality::MRK_FloatMin: - P = CmpInst::FCMP_OLT; - break; - case LoopVectorizationLegality::MRK_FloatMax: - P = CmpInst::FCMP_OGT; - break; - } - - Value *Cmp; - if (RK == LoopVectorizationLegality::MRK_FloatMin || - RK == LoopVectorizationLegality::MRK_FloatMax) - Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); - else - Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); - - Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); - return Select; -} - namespace { struct CSEDenseMapInfo { static bool canHandle(Instruction *I) { @@ -2721,7 +2906,7 @@ struct CSEDenseMapInfo { return LHS->isIdenticalTo(RHS); } }; -} +} // namespace /// \brief Check whether this block is a predicated block. /// Due to if predication of stores we might create a sequence of "if(pred) a[i] @@ -2772,6 +2957,95 @@ static Value *addFastMathFlag(Value *V) { return V; } +/// Estimate the overhead of scalarizing a value. Insert and Extract are set if +/// the result needs to be inserted and/or extracted from vectors. +static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, + const TargetTransformInfo &TTI) { + if (Ty->isVoidTy()) + return 0; + + assert(Ty->isVectorTy() && "Can only scalarize vectors"); + unsigned Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + +// Estimate cost of a call instruction CI if it were vectorized with factor VF. +// Return the cost of the instruction, including scalarization overhead if it's +// needed. The flag NeedToScalarize shows if the call needs to be scalarized - +// i.e. either vector version isn't available, or is too expensive. +static unsigned getVectorCallCost(CallInst *CI, unsigned VF, + const TargetTransformInfo &TTI, + const TargetLibraryInfo *TLI, + bool &NeedToScalarize) { + Function *F = CI->getCalledFunction(); + StringRef FnName = CI->getCalledFunction()->getName(); + Type *ScalarRetTy = CI->getType(); + SmallVector<Type *, 4> Tys, ScalarTys; + for (auto &ArgOp : CI->arg_operands()) + ScalarTys.push_back(ArgOp->getType()); + + // Estimate cost of scalarized vector call. The source operands are assumed + // to be vectors, so we need to extract individual elements from there, + // execute VF scalar calls, and then gather the result into the vector return + // value. + unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); + if (VF == 1) + return ScalarCallCost; + + // Compute corresponding vector type for return value and arguments. + Type *RetTy = ToVectorTy(ScalarRetTy, VF); + for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i) + Tys.push_back(ToVectorTy(ScalarTys[i], VF)); + + // Compute costs of unpacking argument values for the scalar calls and + // packing the return values to a vector. + unsigned ScalarizationCost = + getScalarizationOverhead(RetTy, true, false, TTI); + for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) + ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI); + + unsigned Cost = ScalarCallCost * VF + ScalarizationCost; + + // If we can't emit a vector call for this function, then the currently found + // cost is the cost we need to return. + NeedToScalarize = true; + if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) + return Cost; + + // If the corresponding vector cost is cheaper, return its cost. + unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); + if (VectorCallCost < Cost) { + NeedToScalarize = false; + return VectorCallCost; + } + return Cost; +} + +// Estimate cost of an intrinsic call instruction CI if it were vectorized with +// factor VF. Return the cost of the instruction, including scalarization +// overhead if it's needed. +static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, + const TargetTransformInfo &TTI, + const TargetLibraryInfo *TLI) { + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + assert(ID && "Expected intrinsic call!"); + + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector<Type *, 4> Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); + + return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -2819,10 +3093,14 @@ void InnerLoopVectorizer::vectorizeLoop() { // Find the reduction variable descriptor. assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); - LoopVectorizationLegality::ReductionDescriptor RdxDesc = - (*Legal->getReductionVars())[RdxPhi]; + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; - setDebugLocFromInst(Builder, RdxDesc.StartValue); + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); + Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = + RdxDesc.getMinMaxRecurrenceKind(); + setDebugLocFromInst(Builder, ReductionStartValue); // We need to generate a reduction vector from the incoming scalar. // To do so, we need to generate the 'identity' vector and override @@ -2831,40 +3109,38 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); + VectorParts &VectorExit = getVectorValue(LoopExitInst); Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. Value *Identity; Value *VectorStart; - if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || - RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { + if (RK == RecurrenceDescriptor::RK_IntegerMinMax || + RK == RecurrenceDescriptor::RK_FloatMinMax) { // MinMax reduction have the start value as their identify. if (VF == 1) { - VectorStart = Identity = RdxDesc.StartValue; + VectorStart = Identity = ReductionStartValue; } else { - VectorStart = Identity = Builder.CreateVectorSplat(VF, - RdxDesc.StartValue, - "minmax.ident"); + VectorStart = Identity = + Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); } } else { // Handle other reduction kinds: - Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType()); + Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( + RK, VecTy->getScalarType()); if (VF == 1) { Identity = Iden; // This vector is the Identity vector where the first element is the // incoming scalar reduction. - VectorStart = RdxDesc.StartValue; + VectorStart = ReductionStartValue; } else { Identity = ConstantVector::getSplat(VF, Iden); // This vector is the Identity vector where the first element is the // incoming scalar reduction. - VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + VectorStart = + Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); } } @@ -2893,11 +3169,11 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); VectorParts RdxParts; - setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); + setDebugLocFromInst(Builder, LoopExitInst); for (unsigned part = 0; part < UF; ++part) { // This PHINode contains the vectorized reduction variable, or // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); + VectorParts &RdxExitVal = getVectorValue(LoopExitInst); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) @@ -2909,7 +3185,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; - unsigned Op = getReductionBinOp(RdxDesc.Kind); + unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); setDebugLocFromInst(Builder, ReducedPartRdx); for (unsigned part = 1; part < UF; ++part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) @@ -2918,8 +3194,8 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], ReducedPartRdx, "bin.rdx")); else - ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, - ReducedPartRdx, RdxParts[part]); + ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( + Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); } if (VF > 1) { @@ -2950,7 +3226,8 @@ void InnerLoopVectorizer::vectorizeLoop() { TmpVec = addFastMathFlag(Builder.CreateBinOp( (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); else - TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); + TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, + TmpVec, Shuf); } // The result is in the first element of the vector. @@ -2962,7 +3239,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -2980,7 +3257,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // We found our reduction value exit-PHI. Update it with the // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { // Add an edge coming from the bypass. LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); break; @@ -2995,7 +3272,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); }// end of for each redux variable. fixLCSSAPHIs(); @@ -3136,6 +3413,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, LoopVectorizationLegality::InductionInfo II = Legal->getInductionVars()->lookup(P); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); @@ -3153,80 +3432,42 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, - "offset.idx"); + Broadcasted = II.transform(Builder, NormalizedIdx); + Broadcasted->setName("offset.idx"); } Broadcasted = getBroadcastInstrs(Broadcasted); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = ExtendedIdx; - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); - return; - } - // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. - bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = + Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { - int EltIndex = (part) * (Reverse ? -1 : 1); + int EltIndex = part; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (Reverse) - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - else - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); Entry[part] = SclrGep; continue; } Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + int EltIndex = i + part * VF; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, Builder.getInt32(i), "insert.gep"); @@ -3246,7 +3487,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - case Instruction::PHI:{ + case Instruction::PHI: { // Vectorize PHINodes. widenPHIInstruction(it, Entry, UF, VF, PV); continue; @@ -3367,8 +3608,12 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(OldInduction); + Constant *Step = + ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); propagateMetadata(Entry, it); break; } @@ -3391,37 +3636,71 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); + + StringRef FnName = CI->getCalledFunction()->getName(); + Function *F = CI->getCalledFunction(); + Type *RetTy = ToVectorTy(CI->getType(), VF); + SmallVector<Type *, 4> Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); - assert(ID && "Not an intrinsic call!"); - switch (ID) { - case Intrinsic::assume: - case Intrinsic::lifetime_end: - case Intrinsic::lifetime_start: + if (ID && + (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { scalarizeInstruction(it); break; - default: - bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 4> Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (HasScalarOpd && i == 1) { - Args.push_back(CI->getArgOperand(i)); - continue; - } - VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); - Args.push_back(Arg[Part]); - } - Type *Tys[] = {CI->getType()}; - if (VF > 1) - Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); + } + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + bool UseVectorIntrinsic = + ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(it); + break; + } - Function *F = Intrinsic::getDeclaration(M, ID, Tys); - Entry[Part] = Builder.CreateCall(F, Args); + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value *, 4> Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + Value *Arg = CI->getArgOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { + VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + Arg = VectorArg[Part]; + } + Args.push_back(Arg); } - propagateMetadata(Entry, it); - break; + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the library call. + StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); + assert(!VFnName.empty() && "Vector function name is empty."); + VectorF = M->getFunction(VFnName); + if (!VectorF) { + // Generate a declaration + FunctionType *FTy = FunctionType::get(RetTy, Tys, false); + VectorF = + Function::Create(FTy, Function::ExternalLinkage, VFnName, M); + VectorF->copyAttributesFrom(F); + } + } + assert(VectorF && "Can't create vector function."); + Entry[Part] = Builder.CreateCall(VectorF, Args); } + + propagateMetadata(Entry, it); break; } @@ -3484,7 +3763,7 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { - emitAnalysis(Report() << "if-conversion is disabled"); + emitAnalysis(VectorizationReport() << "if-conversion is disabled"); return false; } @@ -3517,7 +3796,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "loop contains a switch statement"); return false; } @@ -3525,12 +3804,12 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { if (!blockCanBePredicated(BB, SafePointes)) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { - emitAnalysis(Report(BB->getTerminator()) + emitAnalysis(VectorizationReport(BB->getTerminator()) << "control flow cannot be substituted for a select"); return false; } @@ -3545,27 +3824,30 @@ bool LoopVectorizationLegality::canVectorize() { // be canonicalized. if (!TheLoop->getLoopPreheader()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } // We can only vectorize innermost loops. - if (TheLoop->getSubLoopsVector().size()) { - emitAnalysis(Report() << "loop is not the innermost loop"); + if (!TheLoop->getSubLoopsVector().empty()) { + emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } // We must have a single exiting block. if (!TheLoop->getExitingBlock()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } @@ -3574,7 +3856,8 @@ bool LoopVectorizationLegality::canVectorize() { // instructions in the loop are executed the same number of times. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { emitAnalysis( - Report() << "loop control flow is not understood by vectorizer"); + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; } @@ -3592,7 +3875,8 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(Report() << "could not determine number of loop iterations"); + emitAnalysis(VectorizationReport() << + "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3613,9 +3897,14 @@ bool LoopVectorizationLegality::canVectorize() { collectLoopUniforms(); DEBUG(dbgs() << "LV: We can vectorize this loop" << - (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : + "") <<"!\n"); + // Analyze interleaved memory accesses. + if (EnableInterleavedMemAccesses) + InterleaveInfo.analyzeInterleaving(Strides); + // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. @@ -3667,10 +3956,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Look for the attribute signaling the absence of NaNs. Function &F = *Header->getParent(); + const DataLayout &DL = F.getParent()->getDataLayout(); if (F.hasFnAttribute("no-nans-fp-math")) - HasFunNoNaNAttr = F.getAttributes().getAttribute( - AttributeSet::FunctionIndex, - "no-nans-fp-math").getValueAsString() == "true"; + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -3686,7 +3975,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3700,14 +3989,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // identified reduction value with an outside user. if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; - emitAnalysis(Report(it) << "value could not be identified as " - "an induction or reduction variable"); + emitAnalysis(VectorizationReport(it) << + "value could not be identified as " + "an induction or reduction variable"); return false; } // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; @@ -3715,18 +4005,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + ConstantInt *StepValue = nullptr; // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi); + InductionKind IK = isInductionVariable(Phi, StepValue); if (IK_NoInduction != IK) { // Get the widest type. if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); else - WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction) { + if (IK == IK_IntInduction && StepValue->isOne()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other // than it is expedient). @@ -3735,69 +4026,44 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK); + Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(Report(it) << "use of induction value outside of the " - "loop is not handled by vectorizer"); + emitAnalysis(VectorizationReport(it) << + "use of induction value outside of the " + "loop is not handled by vectorizer"); return false; } continue; } - if (AddReductionVar(Phi, RK_IntegerAdd)) { - DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerMult)) { - DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerOr)) { - DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerAnd)) { - DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerXor)) { - DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_IntegerMinMax)) { - DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_FloatMult)) { - DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_FloatAdd)) { - DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, RK_FloatMinMax)) { - DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi << - "\n"); + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, + Reductions[Phi])) { + AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); continue; } - emitAnalysis(Report(it) << "value that could not be identified as " - "reduction is used outside the loop"); + emitAnalysis(VectorizationReport(it) << + "value that could not be identified as " + "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling - // We still don't handle functions. However, we can ignore dbg intrinsic - // calls and we do handle certain intrinsic and libm functions. + // We handle calls that: + // * Are debug info intrinsics. + // * Have a mapping to an IR intrinsic. + // * Have a vector version available. CallInst *CI = dyn_cast<CallInst>(it); - if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { - emitAnalysis(Report(it) << "call instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found a call site.\n"); + if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && + !(CI->getCalledFunction() && TLI && + TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { + emitAnalysis(VectorizationReport(it) << + "call instruction cannot be vectorized"); + DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -3806,7 +4072,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -3817,7 +4083,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { - emitAnalysis(Report(it) + emitAnalysis(VectorizationReport(it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -3827,7 +4093,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (StoreInst *ST = dyn_cast<StoreInst>(it)) { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { - emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(ST) << + "store instruction cannot be vectorized"); return false; } if (EnableMemAccessVersioning) @@ -3841,7 +4108,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(Report(it) << "value cannot be used outside the loop"); + emitAnalysis(VectorizationReport(it) << + "value cannot be used outside the loop"); return false; } @@ -3852,7 +4120,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { - emitAnalysis(Report() + emitAnalysis(VectorizationReport() << "loop induction variable could not be identified"); return false; } @@ -3863,13 +4131,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { ///\brief Remove GEPs whose indices but the last one are loop invariant and /// return the induction operand of the gep pointer. -static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, - const DataLayout *DL, Loop *Lp) { +static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); if (!GEP) return Ptr; - unsigned InductionOperand = getGEPInductionOperand(DL, GEP); + unsigned InductionOperand = getGEPInductionOperand(GEP); // Check that all of the gep indices are uniform except for our induction // operand. @@ -3898,8 +4165,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { ///\brief Get the stride of a pointer access in a loop. /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a /// pointer to the Value, or null otherwise. -static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, - const DataLayout *DL, Loop *Lp) { +static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) return nullptr; @@ -3912,7 +4178,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, // The size of the pointer access. int64_t PtrAccessSize = 1; - Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); + Ptr = stripGetElementPtr(Ptr, SE, Lp); const SCEV *V = SE->getSCEV(Ptr); if (Ptr != OrigPtr) @@ -3931,7 +4197,8 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, // Strip off the size of access multiplication if we are still analyzing the // pointer. if (OrigPtr == Ptr) { - DL->getTypeAllocSize(PtrTy->getElementType()); + const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout(); + DL.getTypeAllocSize(PtrTy->getElementType()); if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { if (M->getOperand(0)->getSCEVType() != scConstant) return nullptr; @@ -3983,7 +4250,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); if (!Stride) return; @@ -4012,7 +4279,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { if (I->getType()->isPointerTy() && isConsecutivePtr(I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); - while (Worklist.size()) { + while (!Worklist.empty()) { Instruction *I = dyn_cast<Instruction>(Worklist.back()); Worklist.pop_back(); @@ -4030,1305 +4297,46 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } -namespace { -/// \brief Analyses memory accesses in a loop. -/// -/// Checks whether run time pointer checks are needed and builds sets for data -/// dependence checking. -class AccessAnalysis { -public: - /// \brief Read or write access location. - typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; - typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - - /// \brief Set of potential dependent memory accesses. - typedef EquivalenceClasses<MemAccessInfo> DepCandidates; - - AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : - DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} - - /// \brief Register a load and whether it is only read from. - void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { - Value *Ptr = const_cast<Value*>(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, false)); - if (IsReadOnly) - ReadOnlyPtr.insert(Ptr); - } - - /// \brief Register a store. - void addStore(AliasAnalysis::Location &Loc) { - Value *Ptr = const_cast<Value*>(Loc.Ptr); - AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); - Accesses.insert(MemAccessInfo(Ptr, true)); - } - - /// \brief Check whether we can check the pointers at runtime for - /// non-intersection. - bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, ValueToValueMap &Strides, - bool ShouldCheckStride = false); - - /// \brief Goes over all memory accesses, checks whether a RT check is needed - /// and builds sets of dependent accesses. - void buildDependenceSets() { - processMemAccesses(); - } - - bool isRTCheckNeeded() { return IsRTCheckNeeded; } - - bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } - void resetDepChecks() { CheckDeps.clear(); } - - MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } - -private: - typedef SetVector<MemAccessInfo> PtrAccessSet; - - /// \brief Go over all memory access and check whether runtime pointer checks - /// are needed /// and build sets of dependency check candidates. - void processMemAccesses(); - - /// Set of all accesses. - PtrAccessSet Accesses; - - /// Set of accesses that need a further dependence check. - MemAccessInfoSet CheckDeps; - - /// Set of pointers that are read only. - SmallPtrSet<Value*, 16> ReadOnlyPtr; - - const DataLayout *DL; - - /// An alias set tracker to partition the access set by underlying object and - //intrinsic property (such as TBAA metadata). - AliasSetTracker AST; - - /// Sets of potentially dependent accesses - members of one set share an - /// underlying pointer. The set "CheckDeps" identfies which sets really need a - /// dependence check. - DepCandidates &DepCands; - - bool IsRTCheckNeeded; -}; - -} // end anonymous namespace - -/// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, - Value *Ptr) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); - if (!AR) - return false; - - return AR->isAffine(); -} - -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap); - -bool AccessAnalysis::canCheckPtrAtRT( - LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, - ValueToValueMap &StridesMap, bool ShouldCheckStride) { - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. - bool CanDoRT = true; - - bool IsDepCheckNeeded = isDependencyCheckNeeded(); - NumComparisons = 0; - - // We assign a consecutive id to access from different alias sets. - // Accesses between different groups doesn't need to be checked. - unsigned ASId = 1; - for (auto &AS : AST) { - unsigned NumReadPtrChecks = 0; - unsigned NumWritePtrChecks = 0; - - // We assign consecutive id to access from different dependence sets. - // Accesses within the same set don't need a runtime check. - unsigned RunningDepId = 1; - DenseMap<Value *, unsigned> DepSetId; - - for (auto A : AS) { - Value *Ptr = A.getValue(); - bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); - MemAccessInfo Access(Ptr, IsWrite); - - if (IsWrite) - ++NumWritePtrChecks; - else - ++NumReadPtrChecks; - - if (hasComputableBounds(SE, StridesMap, Ptr) && - // When we run after a failing dependency check we have to make sure we - // don't have wrapping pointers. - (!ShouldCheckStride || - isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { - // The id of the dependence set. - unsigned DepId; - - if (IsDepCheckNeeded) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; - - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); - - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); - } else { - CanDoRT = false; - } - } - - if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) - NumComparisons += 0; // Only one dependence set. - else { - NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + - NumWritePtrChecks - 1)); - } - - ++ASId; - } - - // If the pointers that we would use for the bounds comparison have different - // address spaces, assume the values aren't directly comparable, so we can't - // use them for the runtime check. We also have to assume they could - // overlap. In the future there should be metadata for whether address spaces - // are disjoint. - unsigned NumPointers = RtCheck.Pointers.size(); - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i + 1; j < NumPointers; ++j) { - // Only need to check pointers between two different dependency sets. - if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) - continue; - // Only need to check pointers in the same alias set. - if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) - continue; - - Value *PtrI = RtCheck.Pointers[i]; - Value *PtrJ = RtCheck.Pointers[j]; - - unsigned ASi = PtrI->getType()->getPointerAddressSpace(); - unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); - if (ASi != ASj) { - DEBUG(dbgs() << "LV: Runtime check would require comparison between" - " different address spaces\n"); - return false; - } - } - } - - return CanDoRT; -} - -void AccessAnalysis::processMemAccesses() { - // We process the set twice: first we process read-write pointers, last we - // process read-only pointers. This allows us to skip dependence tests for - // read-only pointers. - - DEBUG(dbgs() << "LV: Processing memory accesses...\n"); - DEBUG(dbgs() << " AST: "; AST.dump()); - DEBUG(dbgs() << "LV: Accesses:\n"); - DEBUG({ - for (auto A : Accesses) - dbgs() << "\t" << *A.getPointer() << " (" << - (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? - "read-only" : "read")) << ")\n"; - }); - - // The AliasSetTracker has nicely partitioned our pointers by metadata - // compatibility and potential for underlying-object overlap. As a result, we - // only need to check for potential pointer dependencies within each alias - // set. - for (auto &AS : AST) { - // Note that both the alias-set tracker and the alias sets themselves used - // linked lists internally and so the iteration order here is deterministic - // (matching the original instruction order within each set). - - bool SetHasWrite = false; - - // Map of pointers to last access encountered. - typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; - UnderlyingObjToAccessMap ObjToLastAccess; - - // Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; - - // Iterate over each alias set twice, once to process read/write pointers, - // and then to process read-only pointers. - for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { - bool UseDeferred = SetIteration > 0; - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; - - for (auto AV : AS) { - Value *Ptr = AV.getValue(); - - // For a single memory access in AliasSetTracker, Accesses may contain - // both read and write, and they both need to be handled for CheckDeps. - for (auto AC : S) { - if (AC.getPointer() != Ptr) - continue; - - bool IsWrite = AC.getInt(); - - // If we're using the deferred access set, then it contains only - // reads. - bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; - if (UseDeferred && !IsReadOnlyPtr) - continue; - // Otherwise, the pointer must be in the PtrAccessSet, either as a - // read or a write. - assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || - S.count(MemAccessInfo(Ptr, false))) && - "Alias-set pointer not in the access set?"); - - MemAccessInfo Access(Ptr, IsWrite); - DepCands.insert(Access); - - // Memorize read-only pointers for later processing and skip them in - // the first round (they need to be checked after we have seen all - // write pointers). Note: we also mark pointer that are not - // consecutive as "read-only" pointers (so that we check - // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". - if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); - continue; - } - - // If this is a write - check other reads and writes for conflicts. If - // this is a read only check other writes for conflicts (but only if - // there is no other write to the ptr - this is an optimization to - // catch "a[i] = a[i] + " without having to do a dependence check). - if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { - CheckDeps.insert(Access); - IsRTCheckNeeded = true; - } - - if (IsWrite) - SetHasWrite = true; - - // Create sets of pointers connected by a shared alias set and - // underlying object. - typedef SmallVector<Value *, 16> ValueVector; - ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL); - for (Value *UnderlyingObj : TempObjects) { - UnderlyingObjToAccessMap::iterator Prev = - ObjToLastAccess.find(UnderlyingObj); - if (Prev != ObjToLastAccess.end()) - DepCands.unionSets(Access, Prev->second); - - ObjToLastAccess[UnderlyingObj] = Access; - } - } - } - } - } -} - -namespace { -/// \brief Checks memory dependences among accesses to the same underlying -/// object to determine whether there vectorization is legal or not (and at -/// which vectorization factor). -/// -/// This class works under the assumption that we already checked that memory -/// locations with different underlying pointers are "must-not alias". -/// We use the ScalarEvolution framework to symbolically evalutate access -/// functions pairs. Since we currently don't restructure the loop we can rely -/// on the program order of memory accesses to determine their safety. -/// At the moment we will only deem accesses as safe for: -/// * A negative constant distance assuming program order. -/// -/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; -/// a[i] = tmp; y = a[i]; -/// -/// The latter case is safe because later checks guarantuee that there can't -/// be a cycle through a phi node (that is, we check that "x" and "y" is not -/// the same variable: a header phi can only be an induction or a reduction, a -/// reduction can't have a memory sink, an induction can't have a memory -/// source). This is important and must not be violated (or we have to -/// resort to checking for cycles through memory). -/// -/// * A positive constant distance assuming program order that is bigger -/// than the biggest memory access. -/// -/// tmp = a[i] OR b[i] = x -/// a[i+2] = tmp y = b[i+2]; -/// -/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. -/// -/// * Zero distances and all accesses have the same size. -/// -class MemoryDepChecker { -public: - typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; - typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; - - MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) - : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), - ShouldRetryWithRuntimeCheck(false) {} - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(StoreInst *SI) { - Value *Ptr = SI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); - InstMap.push_back(SI); - ++AccessIdx; - } - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(LoadInst *LI) { - Value *Ptr = LI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); - InstMap.push_back(LI); - ++AccessIdx; - } - - /// \brief Check whether the dependencies between the accesses are safe. - /// - /// Only checks sets with elements in \p CheckDeps. - bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); - - /// \brief The maximum number of bytes of a vector register we can vectorize - /// the accesses safely with. - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } - - /// \brief In same cases when the dependency check fails we can still - /// vectorize the loop with a dynamic array access check. - bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } - -private: - ScalarEvolution *SE; - const DataLayout *DL; - const Loop *InnermostLoop; - - /// \brief Maps access locations (ptr, read/write) to program order. - DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; - - /// \brief Memory access instructions in program order. - SmallVector<Instruction *, 16> InstMap; - - /// \brief The program order index to be used for the next instruction. - unsigned AccessIdx; - - // We can access this many bytes in parallel safely. - unsigned MaxSafeDepDistBytes; - - /// \brief If we see a non-constant dependence distance we can still try to - /// vectorize this loop with runtime checks. - bool ShouldRetryWithRuntimeCheck; - - /// \brief Check whether there is a plausible dependence between the two - /// accesses. - /// - /// Access \p A must happen before \p B in program order. The two indices - /// identify the index into the program order map. - /// - /// This function checks whether there is a plausible dependence (or the - /// absence of such can't be proved) between the two accesses. If there is a - /// plausible dependence but the dependence distance is bigger than one - /// element access it records this distance in \p MaxSafeDepDistBytes (if this - /// distance is smaller than any other distance encountered so far). - /// Otherwise, this function returns true signaling a possible dependence. - bool isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides); - - /// \brief Check whether the data dependence could prevent store-load - /// forwarding. - bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); -}; - -} // end anonymous namespace - -static bool isInBoundsGep(Value *Ptr) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) - return GEP->isInBounds(); - return false; -} - -/// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap) { - const Type *Ty = Ptr->getType(); - assert(Ty->isPointerTy() && "Unexpected non-ptr"); - - // Make sure that the pointer does not point to aggregate types. - const PointerType *PtrTy = cast<PointerType>(Ty); - if (PtrTy->getElementType()->isAggregateType()) { - DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << - "\n"); - return 0; - } - - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); - - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); - if (!AR) { - DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // The accesss function must stride over the innermost loop. - if (Lp != AR->getLoop()) { - DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << - *Ptr << " SCEV: " << *PtrScev << "\n"); - } - - // The address calculation must not wrap. Otherwise, a dependence could be - // inverted. - // An inbounds getelementptr that is a AddRec with a unit stride - // cannot wrap per definition. The unit stride requirement is checked later. - // An getelementptr without an inbounds attribute and unit stride would have - // to access the pointer value "0" which is undefined behavior in address - // space 0, therefore we can also vectorize this case. - bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); - bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; - if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { - DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Calculate the pointer stride and check if it is consecutive. - const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); - if (!C) { - DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << - " SCEV: " << *PtrScev << "\n"); - return 0; - } - - int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); - const APInt &APStepVal = C->getValue()->getValue(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return 0; - - int64_t StepVal = APStepVal.getSExtValue(); - - // Strided access. - int64_t Stride = StepVal / Size; - int64_t Rem = StepVal % Size; - if (Rem) - return 0; - - // If the SCEV could wrap but we have an inbounds gep with a unit stride we - // know we can't "wrap around the address space". In case of address space - // zero we know that this won't happen without triggering undefined behavior. - if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && - Stride != 1 && Stride != -1) - return 0; - - return Stride; -} - -bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, - unsigned TypeByteSize) { - // If loads occur at a distance that is not a multiple of a feasible vector - // factor store-load forwarding does not take place. - // Positive dependences might cause troubles because vectorizing them might - // prevent store-load forwarding making vectorized code run a lot slower. - // a[i] = a[i-3] ^ a[i-8]; - // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and - // hence on your typical architecture store-load forwarding does not take - // place. Vectorizing in such cases does not make sense. - // Store-load forwarding distance. - const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; - // Maximum vector factor. - unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; - if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) - MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; - - for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; - vf *= 2) { - if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { - MaxVFWithoutSLForwardIssues = (vf >>=1); - break; - } - } - - if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { - DEBUG(dbgs() << "LV: Distance " << Distance << - " that could cause a store-load forwarding conflict\n"); - return true; - } - - if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && - MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) - MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; - return false; -} - -bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides) { - assert (AIdx < BIdx && "Must pass arguments in program order"); - - Value *APtr = A.getPointer(); - Value *BPtr = B.getPointer(); - bool AIsWrite = A.getInt(); - bool BIsWrite = B.getInt(); - - // Two reads are independent. - if (!AIsWrite && !BIsWrite) - return false; - - // We cannot check pointers in different address spaces. - if (APtr->getType()->getPointerAddressSpace() != - BPtr->getType()->getPointerAddressSpace()) - return true; - - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); - - int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); - int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); - - const SCEV *Src = AScev; - const SCEV *Sink = BScev; - - // If the induction step is negative we have to invert source and sink of the - // dependence. - if (StrideAPtr < 0) { - //Src = BScev; - //Sink = AScev; - std::swap(APtr, BPtr); - std::swap(Src, Sink); - std::swap(AIsWrite, BIsWrite); - std::swap(AIdx, BIdx); - std::swap(StrideAPtr, StrideBPtr); - } - - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); - - DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); - DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); - - // Need consecutive accesses. We don't want to vectorize - // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in - // the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ - DEBUG(dbgs() << "Non-consecutive pointer access\n"); - return true; - } - - const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); - if (!C) { - DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n"); - ShouldRetryWithRuntimeCheck = true; - return true; - } - - Type *ATy = APtr->getType()->getPointerElementType(); - Type *BTy = BPtr->getType()->getPointerElementType(); - unsigned TypeByteSize = DL->getTypeAllocSize(ATy); - - // Negative distances are not plausible dependencies. - const APInt &Val = C->getValue()->getValue(); - if (Val.isNegative()) { - bool IsTrueDataDependence = (AIsWrite && !BIsWrite); - if (IsTrueDataDependence && - (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || - ATy != BTy)) - return true; - - DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n"); - return false; - } - - // Write to the same location with the same size. - // Could be improved to assert type sizes are the same (i32 == float, etc). - if (Val == 0) { - if (ATy == BTy) - return false; - DEBUG(dbgs() << "LV: Zero dependence difference but different types\n"); - return true; - } - - assert(Val.isStrictlyPositive() && "Expect a positive value"); - - // Positive distance bigger than max vectorization factor. - if (ATy != BTy) { - DEBUG(dbgs() << - "LV: ReadWrite-Write positive dependency with different types\n"); - return false; - } - - unsigned Distance = (unsigned) Val.getZExtValue(); - - // Bail out early if passed-in parameters make vectorization not feasible. - unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; - unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LV: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); - return true; - } - - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; - - bool IsTrueDataDependence = (!AIsWrite && BIsWrite); - if (IsTrueDataDependence && - couldPreventStoreLoadForward(Distance, TypeByteSize)) - return true; - - DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << - " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); - - return false; -} - -bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, - ValueToValueMap &Strides) { - - MaxSafeDepDistBytes = -1U; - while (!CheckDeps.empty()) { - MemAccessInfo CurAccess = *CheckDeps.begin(); - - // Get the relevant memory access set. - EquivalenceClasses<MemAccessInfo>::iterator I = - AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); - - // Check accesses within this set. - EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; - AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); - - // Check every access pair. - while (AI != AE) { - CheckDeps.erase(*AI); - EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); - while (OI != AE) { - // Check every accessing instruction pair in program order. - for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), - I1E = Accesses[*AI].end(); I1 != I1E; ++I1) - for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), - I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { - if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) - return false; - if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) - return false; - } - ++OI; - } - AI++; - } - } - return true; -} - bool LoopVectorizationLegality::canVectorizeMemory() { - - typedef SmallVector<Value*, 16> ValueVector; - typedef SmallPtrSet<Value*, 16> ValueSet; - - // Holds the Load and Store *instructions*. - ValueVector Loads; - ValueVector Stores; - - // Holds all the different accesses in the loop. - unsigned NumReads = 0; - unsigned NumReadWrites = 0; - - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; - - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - MemoryDepChecker DepChecker(SE, DL, TheLoop); - - // For each block. - for (Loop::block_iterator bb = TheLoop->block_begin(), - be = TheLoop->block_end(); bb != be; ++bb) { - - // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (it->mayReadFromMemory()) { - // Many math library functions read the rounding mode. We will only - // vectorize a loop if it contains known function calls that don't set - // the flag. Therefore, it is safe to ignore this read from memory. - CallInst *Call = dyn_cast<CallInst>(it); - if (Call && getIntrinsicIDForCall(Call, TLI)) - continue; - - LoadInst *Ld = dyn_cast<LoadInst>(it); - if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { - emitAnalysis(Report(Ld) - << "read with atomic ordering or volatile read"); - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; - } - NumLoads++; - Loads.push_back(Ld); - DepChecker.addAccess(Ld); - continue; - } - - // Save 'store' instructions. Abort if other instructions write to memory. - if (it->mayWriteToMemory()) { - StoreInst *St = dyn_cast<StoreInst>(it); - if (!St) { - emitAnalysis(Report(it) << "instruction cannot be vectorized"); - return false; - } - if (!St->isSimple() && !IsAnnotatedParallel) { - emitAnalysis(Report(St) - << "write with atomic ordering or volatile write"); - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; - } - NumStores++; - Stores.push_back(St); - DepChecker.addAccess(St); - } - } // Next instr. - } // Next block. - - // Now we have two lists that hold the loads and the stores. - // Next, we find the pointers that they use. - - // Check if we see any stores. If there are no stores, then we don't - // care if the pointers are *restrict*. - if (!Stores.size()) { - DEBUG(dbgs() << "LV: Found a read-only loop!\n"); - return true; - } - - AccessAnalysis::DepCandidates DependentAccesses; - AccessAnalysis Accesses(DL, AA, DependentAccesses); - - // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects - // multiple times on the same object. If the ptr is accessed twice, once - // for read and once for write, it will only appear once (on the write - // list). This is okay, since we are going to check for conflicts between - // writes and between reads and writes, but not between reads and reads. - ValueSet Seen; - - ValueVector::iterator I, IE; - for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = cast<StoreInst>(*I); - Value* Ptr = ST->getPointerOperand(); - - if (isUniform(Ptr)) { - emitAnalysis( - Report(ST) - << "write to a loop invariant address could not be vectorized"); - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - // If we did *not* see this pointer before, insert it to the read-write - // list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr).second) { - ++NumReadWrites; - - AliasAnalysis::Location Loc = AA->getLocation(ST); - // The TBAA metadata could have a control dependency on the predication - // condition, so we cannot rely on it when determining whether or not we - // need runtime pointer checks. - if (blockNeedsPredication(ST->getParent())) - Loc.AATags.TBAA = nullptr; - - Accesses.addStore(Loc); - } - } - - if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - - for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = cast<LoadInst>(*I); - Value* Ptr = LD->getPointerOperand(); - // If we did *not* see this pointer before, insert it to the - // read list. If we *did* see it before, then it is already in - // the read-write list. This allows us to vectorize expressions - // such as A[i] += x; Because the address of A[i] is a read-write - // pointer. This only works if the index of A[i] is consecutive. - // If the address of i is unknown (for example A[B[i]]) then we may - // read a few words, modify, and write a few words, and some of the - // words may be written to the same address. - bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || - !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { - ++NumReads; - IsReadOnlyPtr = true; - } - - AliasAnalysis::Location Loc = AA->getLocation(LD); - // The TBAA metadata could have a control dependency on the predication - // condition, so we cannot rely on it when determining whether or not we - // need runtime pointer checks. - if (blockNeedsPredication(LD->getParent())) - Loc.AATags.TBAA = nullptr; - - Accesses.addLoad(Loc, IsReadOnlyPtr); - } - - // If we write (or read-write) to a single destination and there are no - // other reads in this loop then is it safe to vectorize. - if (NumReadWrites == 1 && NumReads == 0) { - DEBUG(dbgs() << "LV: Found a write-only loop!\n"); - return true; - } - - // Build dependence sets and check whether we need a runtime pointer bounds - // check. - Accesses.buildDependenceSets(); - bool NeedRTCheck = Accesses.isRTCheckNeeded(); - - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. - unsigned NumComparisons = 0; - bool CanDoRT = false; - if (NeedRTCheck) - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, - Strides); - - DEBUG(dbgs() << "LV: We need to do " << NumComparisons << - " pointer comparisons.\n"); - - // If we only have one set of dependences to check pointers among we don't - // need a runtime check. - if (NumComparisons == 0 && NeedRTCheck) - NeedRTCheck = false; - - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - PtrRtCheck.reset(); - CanDoRT = false; - } - - if (CanDoRT) { - DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); - } - - if (NeedRTCheck && !CanDoRT) { - emitAnalysis(Report() << "cannot identify array bounds"); - DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << - "the array bounds.\n"); - PtrRtCheck.reset(); - return false; - } - - PtrRtCheck.Need = NeedRTCheck; - - bool CanVecMem = true; - if (Accesses.isDependencyCheckNeeded()) { - DEBUG(dbgs() << "LV: Checking memory dependencies\n"); - CanVecMem = DepChecker.areDepsSafe( - DependentAccesses, Accesses.getDependenciesToCheck(), Strides); - MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); - - if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { - DEBUG(dbgs() << "LV: Retrying with memory checks\n"); - NeedRTCheck = true; - - // Clear the dependency checks. We assume they are not needed. - Accesses.resetDepChecks(); - - PtrRtCheck.reset(); - PtrRtCheck.Need = true; - - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, - TheLoop, Strides, true); - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - if (!CanDoRT && NumComparisons > 0) - emitAnalysis(Report() - << "cannot check memory dependencies at runtime"); - else - emitAnalysis(Report() - << NumComparisons << " exceeds limit of " - << RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); - PtrRtCheck.reset(); - return false; - } - - CanVecMem = true; - } - } - - if (!CanVecMem) - emitAnalysis(Report() << "unsafe dependent memory operations in loop"); - - DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << - " need a runtime memory check.\n"); - - return CanVecMem; -} - -static bool hasMultipleUsesOf(Instruction *I, - SmallPtrSetImpl<Instruction *> &Insts) { - unsigned NumUses = 0; - for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { - if (Insts.count(dyn_cast<Instruction>(*Use))) - ++NumUses; - if (NumUses > 1) - return true; - } - - return false; -} - -static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) { - for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) - if (!Set.count(dyn_cast<Instruction>(*Use))) - return false; - return true; -} - -bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, - ReductionKind Kind) { - if (Phi->getNumIncomingValues() != 2) + LAI = &LAA->getInfo(TheLoop, Strides); + auto &OptionalReport = LAI->getReport(); + if (OptionalReport) + emitAnalysis(VectorizationReport(*OptionalReport)); + if (!LAI->canVectorizeMemory()) return false; - // Reduction variables are only found in the loop header block. - if (Phi->getParent() != TheLoop->getHeader()) + if (LAI->hasStoreToLoopInvariantAddress()) { + emitAnalysis( + VectorizationReport() + << "write to a loop invariant address could not be vectorized"); + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; - - // Obtain the reduction start value from the value that comes from the loop - // preheader. - Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); - - // ExitInstruction is the single value which is used outside the loop. - // We only allow for a single reduction value to be used outside the loop. - // This includes users of the reduction, variables (which form a cycle - // which ends in the phi node). - Instruction *ExitInstruction = nullptr; - // Indicates that we found a reduction operation in our scan. - bool FoundReduxOp = false; - - // We start with the PHI node and scan for all of the users of this - // instruction. All users must be instructions that can be used as reduction - // variables (such as ADD). We must have a single out-of-block user. The cycle - // must include the original PHI. - bool FoundStartPHI = false; - - // To recognize min/max patterns formed by a icmp select sequence, we store - // the number of instruction we saw from the recognized min/max pattern, - // to make sure we only see exactly the two instructions. - unsigned NumCmpSelectPatternInst = 0; - ReductionInstDesc ReduxDesc(false, nullptr); - - SmallPtrSet<Instruction *, 8> VisitedInsts; - SmallVector<Instruction *, 8> Worklist; - Worklist.push_back(Phi); - VisitedInsts.insert(Phi); - - // A value in the reduction can be used: - // - By the reduction: - // - Reduction operation: - // - One use of reduction value (safe). - // - Multiple use of reduction value (not safe). - // - PHI: - // - All uses of the PHI must be the reduction (safe). - // - Otherwise, not safe. - // - By one instruction outside of the loop (safe). - // - By further instructions outside of the loop (not safe). - // - By an instruction that is not part of the reduction (not safe). - // This is either: - // * An instruction type other than PHI or the reduction operation. - // * A PHI in the header other than the initial PHI. - while (!Worklist.empty()) { - Instruction *Cur = Worklist.back(); - Worklist.pop_back(); - - // No Users. - // If the instruction has no users then this is a broken chain and can't be - // a reduction variable. - if (Cur->use_empty()) - return false; - - bool IsAPhi = isa<PHINode>(Cur); - - // A header PHI use other than the original PHI. - if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) - return false; - - // Reductions of instructions such as Div, and Sub is only possible if the - // LHS is the reduction variable. - if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && - !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && - !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) - return false; - - // Any reduction instruction must be of one of the allowed kinds. - ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); - if (!ReduxDesc.IsReduction) - return false; - - // A reduction operation must only have one use of the reduction value. - if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && - hasMultipleUsesOf(Cur, VisitedInsts)) - return false; - - // All inputs to a PHI node must be a reduction value. - if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) - return false; - - if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || - isa<SelectInst>(Cur))) - ++NumCmpSelectPatternInst; - if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || - isa<SelectInst>(Cur))) - ++NumCmpSelectPatternInst; - - // Check whether we found a reduction operator. - FoundReduxOp |= !IsAPhi; - - // Process users of current instruction. Push non-PHI nodes after PHI nodes - // onto the stack. This way we are going to have seen all inputs to PHI - // nodes once we get to them. - SmallVector<Instruction *, 8> NonPHIs; - SmallVector<Instruction *, 8> PHIs; - for (User *U : Cur->users()) { - Instruction *UI = cast<Instruction>(U); - - // Check if we found the exit user. - BasicBlock *Parent = UI->getParent(); - if (!TheLoop->contains(Parent)) { - // Exit if you find multiple outside users or if the header phi node is - // being used. In this case the user uses the value of the previous - // iteration, in which case we would loose "VF-1" iterations of the - // reduction operation if we vectorize. - if (ExitInstruction != nullptr || Cur == Phi) - return false; - - // The instruction used by an outside user must be the last instruction - // before we feed back to the reduction phi. Otherwise, we loose VF-1 - // operations on the value. - if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) - return false; - - ExitInstruction = Cur; - continue; - } - - // Process instructions only once (termination). Each reduction cycle - // value must only be used once, except by phi nodes and min/max - // reductions which are represented as a cmp followed by a select. - ReductionInstDesc IgnoredVal(false, nullptr); - if (VisitedInsts.insert(UI).second) { - if (isa<PHINode>(UI)) - PHIs.push_back(UI); - else - NonPHIs.push_back(UI); - } else if (!isa<PHINode>(UI) && - ((!isa<FCmpInst>(UI) && - !isa<ICmpInst>(UI) && - !isa<SelectInst>(UI)) || - !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) - return false; - - // Remember that we completed the cycle. - if (UI == Phi) - FoundStartPHI = true; - } - Worklist.append(PHIs.begin(), PHIs.end()); - Worklist.append(NonPHIs.begin(), NonPHIs.end()); } - // This means we have seen one but not the other instruction of the - // pattern or more than just a select and cmp. - if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && - NumCmpSelectPatternInst != 2) - return false; - - if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) + if (LAI->getNumRuntimePointerChecks() > + VectorizerParams::RuntimeMemoryCheckThreshold) { + emitAnalysis(VectorizationReport() + << LAI->getNumRuntimePointerChecks() << " exceeds limit of " + << VectorizerParams::RuntimeMemoryCheckThreshold + << " dependent memory operations checked at runtime"); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); return false; - - // We found a reduction var if we have reached the original phi node and we - // only have a single instruction with out-of-loop users. - - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); - - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.MinMaxKind); - Reductions[Phi] = RD; - // We've ended the cycle. This is a reduction variable if we have an - // outside user and it has a binary op. - - return true; -} - -/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction -/// pattern corresponding to a min(X, Y) or max(X, Y). -LoopVectorizationLegality::ReductionInstDesc -LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, - ReductionInstDesc &Prev) { - - assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && - "Expect a select instruction"); - Instruction *Cmp = nullptr; - SelectInst *Select = nullptr; - - // We must handle the select(cmp()) as a single instruction. Advance to the - // select. - if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { - if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) - return ReductionInstDesc(false, I); - return ReductionInstDesc(Select, Prev.MinMaxKind); - } - - // Only handle single use cases for now. - if (!(Select = dyn_cast<SelectInst>(I))) - return ReductionInstDesc(false, I); - if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && - !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) - return ReductionInstDesc(false, I); - if (!Cmp->hasOneUse()) - return ReductionInstDesc(false, I); - - Value *CmpLeft; - Value *CmpRight; - - // Look for a min/max pattern. - if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_UIntMin); - else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_UIntMax); - else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_SIntMax); - else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_SIntMin); - else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMin); - else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMax); - else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMin); - else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) - return ReductionInstDesc(Select, MRK_FloatMax); - - return ReductionInstDesc(false, I); -} - -LoopVectorizationLegality::ReductionInstDesc -LoopVectorizationLegality::isReductionInstr(Instruction *I, - ReductionKind Kind, - ReductionInstDesc &Prev) { - bool FP = I->getType()->isFloatingPointTy(); - bool FastMath = FP && I->hasUnsafeAlgebra(); - switch (I->getOpcode()) { - default: - return ReductionInstDesc(false, I); - case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && - Kind != RK_FloatMinMax)) - return ReductionInstDesc(false, I); - return ReductionInstDesc(I, Prev.MinMaxKind); - case Instruction::Sub: - case Instruction::Add: - return ReductionInstDesc(Kind == RK_IntegerAdd, I); - case Instruction::Mul: - return ReductionInstDesc(Kind == RK_IntegerMult, I); - case Instruction::And: - return ReductionInstDesc(Kind == RK_IntegerAnd, I); - case Instruction::Or: - return ReductionInstDesc(Kind == RK_IntegerOr, I); - case Instruction::Xor: - return ReductionInstDesc(Kind == RK_IntegerXor, I); - case Instruction::FMul: - return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); - case Instruction::FSub: - case Instruction::FAdd: - return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: - if (Kind != RK_IntegerMinMax && - (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) - return ReductionInstDesc(false, I); - return isMinMaxSelectCmpPattern(I, Prev); } + return true; } LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { - Type *PhiTy = Phi->getType(); - // We only handle integer and pointer inductions variables. - if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) - return IK_NoInduction; - - // Check that the PHI is consecutive. - const SCEV *PhiScev = SE->getSCEV(Phi); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); - if (!AR) { - DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return IK_NoInduction; - } - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Integer inductions need to have a stride of one. - if (PhiTy->isIntegerTy()) { - if (Step->isOne()) - return IK_IntInduction; - if (Step->isAllOnesValue()) - return IK_ReverseIntInduction; - return IK_NoInduction; - } - - // Calculate the pointer stride and check if it is consecutive. - const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); - if (!C) - return IK_NoInduction; - - assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); - Type *PointerElementType = PhiTy->getPointerElementType(); - // The pointer stride cannot be determined if the pointer element type is not - // sized. - if (!PointerElementType->isSized()) +LoopVectorizationLegality::isInductionVariable(PHINode *Phi, + ConstantInt *&StepValue) { + if (!isInductionPHI(Phi, SE, StepValue)) return IK_NoInduction; - uint64_t Size = DL->getTypeAllocSize(PointerElementType); - if (C->getValue()->equalsInt(Size)) - return IK_PtrInduction; - else if (C->getValue()->equalsInt(0 - Size)) - return IK_ReversePtrInduction; - - return IK_NoInduction; + Type *PhiTy = Phi->getType(); + // Found an Integer induction variable. + if (PhiTy->isIntegerTy()) + return IK_IntInduction; + // Found an Pointer induction variable. + return IK_PtrInduction; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -5341,11 +4349,7 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) { } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - assert(TheLoop->contains(BB) && "Unknown block used"); - - // Blocks that do not dominate the latch need predication. - BasicBlock* Latch = TheLoop->getLoopLatch(); - return !DT->dominates(BB, Latch); + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, @@ -5416,18 +4420,182 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, return true; } +void InterleavedAccessInfo::collectConstStridedAccesses( + MapVector<Instruction *, StrideDescriptor> &StrideAccesses, + const ValueToValueMap &Strides) { + // Holds load/store instructions in program order. + SmallVector<Instruction *, 16> AccessList; + + for (auto *BB : TheLoop->getBlocks()) { + bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); + + for (auto &I : *BB) { + if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I)) + continue; + // FIXME: Currently we can't handle mixed accesses and predicated accesses + if (IsPred) + return; + + AccessList.push_back(&I); + } + } + + if (AccessList.empty()) + return; + + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto I : AccessList) { + LoadInst *LI = dyn_cast<LoadInst>(I); + StoreInst *SI = dyn_cast<StoreInst>(I); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + + // The factor of the corresponding interleave group. + unsigned Factor = std::abs(Stride); + + // Ignore the access if the factor is too small or too large. + if (Factor < 2 || Factor > MaxInterleaveGroupFactor) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); + } +} + +// Analyze interleaved accesses and collect them into interleave groups. +// +// Notice that the vectorization on interleaved groups will change instruction +// orders and may break dependences. But the memory dependence check guarantees +// that there is no overlap between two pointers of different strides, element +// sizes or underlying bases. +// +// For pointers sharing the same stride, element size and underlying base, no +// need to worry about Read-After-Write dependences and Write-After-Read +// dependences. +// +// E.g. The RAW dependence: A[i] = a; +// b = A[i]; +// This won't exist as it is a store-load forwarding conflict, which has +// already been checked and forbidden in the dependence check. +// +// E.g. The WAR dependence: a = A[i]; // (1) +// A[i] = b; // (2) +// The store group of (2) is always inserted at or below (2), and the load group +// of (1) is always inserted at or above (1). The dependence is safe. +void InterleavedAccessInfo::analyzeInterleaving( + const ValueToValueMap &Strides) { + DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); + + // Holds all the stride accesses. + MapVector<Instruction *, StrideDescriptor> StrideAccesses; + collectConstStridedAccesses(StrideAccesses, Strides); + + if (StrideAccesses.empty()) + return; + + // Holds all interleaved store groups temporarily. + SmallSetVector<InterleaveGroup *, 4> StoreGroups; + + // Search the load-load/write-write pair B-A in bottom-up order and try to + // insert B into the interleave group of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. B belongs to the group according to the distance. + // + // The bottom-up order can avoid breaking the Write-After-Write dependences + // between two pointers of the same base. + // E.g. A[i] = a; (1) + // A[i] = b; (2) + // A[i+1] = c (3) + // We form the group (2)+(3) in front, so (1) has to form groups with accesses + // above (1), which guarantees that (1) is always above (2). + for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E; + ++I) { + Instruction *A = I->first; + StrideDescriptor DesA = I->second; + + InterleaveGroup *Group = getInterleaveGroup(A); + if (!Group) { + DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n'); + Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); + } + + if (A->mayWriteToMemory()) + StoreGroups.insert(Group); + + for (auto II = std::next(I); II != E; ++II) { + Instruction *B = II->first; + StrideDescriptor DesB = II->second; + + // Ignore if B is already in a group or B is a different memory operation. + if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory()) + continue; + + // Check the rule 1 and 2. + if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) + continue; + + // Calculate the distance and prepare for the rule 3. + const SCEVConstant *DistToA = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + if (!DistToA) + continue; + + int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + + // Skip if the distance is not multiple of size as they are not in the + // same group. + if (DistanceToA % static_cast<int>(DesA.Size)) + continue; + + // The index of B is the index of A plus the related index to A. + int IndexB = + Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size); + + // Try to insert B into the group. + if (Group->insertMember(B, IndexB, DesB.Align)) { + DEBUG(dbgs() << "LV: Inserted:" << *B << '\n' + << " into the interleave group with" << *A << '\n'); + InterleaveGroupMap[B] = Group; + + // Set the first load in program order as the insert position. + if (B->mayReadFromMemory()) + Group->setInsertPos(B); + } + } // Iteration on instruction B + } // Iteration on instruction A + + // Remove interleaved store groups with gaps. + for (InterleaveGroup *Group : StoreGroups) + if (Group->getNumMembers() != Group->getFactor()) + releaseGroup(Group); +} + LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { - emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); + emitAnalysis(VectorizationReport() << + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return Factor; } - if (!EnableCondStoresVectorization && Legal->NumPredStores) { - emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); + if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { + emitAnalysis(VectorizationReport() << + "store that is conditionally executed prevents vectorization"); DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); return Factor; } @@ -5462,7 +4630,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { if (OptForSize) { // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { - emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); + emitAnalysis + (VectorizationReport() << + "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } @@ -5476,10 +4646,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. if (VF < 2) { - emitAnalysis(Report() << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); + emitAnalysis(VectorizationReport() << + "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } @@ -5532,6 +4703,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned LoopVectorizationCostModel::getWidestType() { unsigned MaxWidth = 8; + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -5566,7 +4738,7 @@ unsigned LoopVectorizationCostModel::getWidestType() { continue; MaxWidth = std::max(MaxWidth, - (unsigned)DL->getTypeSizeInBits(T->getScalarType())); + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } @@ -5645,7 +4817,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. - unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); + unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the unroll max. if (VF == 1) { @@ -5692,8 +4864,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // Unroll until store/load ports (estimated by max unroll factor) are // saturated. - unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); - unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); + unsigned NumStores = Legal->getNumStores(); + unsigned NumLoads = Legal->getNumLoads(); + unsigned StoresUF = UF / (NumStores ? NumStores : 1); + unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1); // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop @@ -5716,6 +4890,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, return SmallUF; } + // Unroll if this is a large loop (small loops are already dealt with by this + // point) that could benefit from interleaved unrolling. + bool HasReductions = (Legal->getReductionVars()->size() > 0); + if (TTI.enableAggressiveInterleaving(HasReductions)) { + DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n"); + return UF; + } + DEBUG(dbgs() << "LV: Not Unrolling.\n"); return 1; } @@ -6050,11 +5232,52 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // For an interleaved access, calculate the total cost of the whole + // interleave group. + if (Legal->isAccessInterleaved(I)) { + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + // Only calculate the cost once at the insert position. + if (Group->getInsertPos() != I) + return 0; + + unsigned InterleaveFactor = Group->getFactor(); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * InterleaveFactor); + + // Holds the indices of existing members in an interleaved load group. + // An interleaved store group doesn't need this as it dones't allow gaps. + SmallVector<unsigned, 4> Indices; + if (LI) { + for (unsigned i = 0; i < InterleaveFactor; i++) + if (Group->getMember(i)) + Indices.push_back(i); + } + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlignment(), AS); + + if (Group->isReverse()) + Cost += + Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + + // FIXME: The interleaved load group with a huge gap could be even more + // expensive than scalar operations. Then we could ignore such group and + // use scalar operations instead. + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); - unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; + const DataLayout &DL = I->getModule()->getDataLayout(); + unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); + unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); @@ -6081,7 +5304,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, + AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, @@ -6111,14 +5338,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { + bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); - Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); - assert(ID && "Not an intrinsic call!"); - Type *RetTy = ToVectorTy(CI->getType(), VF); - SmallVector<Type*, 4> Tys; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) - Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); - return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); + unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); + if (getIntrinsicIDForCall(CI, TLI)) + return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); + return CallCost; } default: { // We are scalarizing the instruction. Return the cost of the scalar @@ -6145,24 +5370,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { }// end of switch. } -Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { - if (Scalar->isVoidTy() || VF == 1) - return Scalar; - return VectorType::get(Scalar, VF); -} - char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6259,7 +5479,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, ConstantInt::get(Cond[Part]->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } @@ -6285,7 +5505,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -6310,11 +5530,10 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { // When unrolling and the VF is 1, we only need to add a simple scalar. Type *ITy = Val->getType(); assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx, Negate); - return Builder.CreateAdd(Val, C, "induction"); + Constant *C = ConstantInt::get(ITy, StartIdx); + return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bd8a4b3..370e295 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17,9 +17,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" @@ -75,6 +75,15 @@ static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; +// Limit the number of alias checks. The limit is chosen so that +// it has no negative effect on the llvm benchmarks. +static const unsigned AliasedCheckLimit = 10; + +// Another limit for the alias checks: The maximum distance between load/store +// instructions where alias checks are done. +// This limit is useful for very large basic blocks. +static const unsigned MaxMemDepDistance = 160; + /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM @@ -278,104 +287,6 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) { return true; } -static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - - SmallVector<Value *, 16> OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - Value *V0 = I->getOperand(0); - Value *V1 = I->getOperand(1); - - OrigLeft.push_back(V0); - OrigRight.push_back(V1); - - Instruction *I0 = dyn_cast<Instruction>(V0); - Instruction *I1 = dyn_cast<Instruction>(V1); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. - AllSameOpcodeLeft = I0; - AllSameOpcodeRight = I1; - - if (i && AllSameOpcodeLeft) { - if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { - if(P0->getOpcode() != I0->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight) { - if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { - if(P1->getOpcode() != I1->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (I0 && I1) { - if(!i && I0->getOpcode() > I1->getOpcode()) { - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else { - Left.push_back(I0); - Right.push_back(I1); - } - continue; - } - // One opcode, put the instruction on the right. - if (I0) { - Left.push_back(V1); - Right.push_back(I0); - continue; - } - Left.push_back(V0); - Right.push_back(V1); - } - - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // Don't reorder if the operands where good to begin with. - if (!(LeftBroadcast || RightBroadcast) && - (AllSameOpcodeRight || AllSameOpcodeLeft)) { - Left = OrigLeft; - Right = OrigRight; - } -} - /// \returns True if in-tree use also needs extract. This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, @@ -404,12 +315,23 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } /// \returns the AA location that is being access by the instruction. -static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { +static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return AA->getLocation(SI); + return MemoryLocation::get(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return MemoryLocation::get(LI); + return MemoryLocation(); +} + +/// \returns True if the instruction is not a volatile or atomic load/store. +static bool isSimple(Instruction *I) { if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return true; } /// Bottom Up SLP Vectorizer. @@ -420,11 +342,11 @@ public: typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; - BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, - TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, - LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) + BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, + TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, + DominatorTree *Dt, AssumptionCache *AC) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), - SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); } @@ -461,7 +383,7 @@ public: } /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); + bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -518,6 +440,16 @@ private: /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + /// \reorder commutative operands in alt shuffle if they result in + /// vectorized code. + void reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); + /// \reorder commutative operands to get better probability of + /// generating vectorized code. + void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {} @@ -540,7 +472,7 @@ private: /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { - VectorizableTree.push_back(TreeEntry()); + VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -583,7 +515,7 @@ private: /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it /// is invariant in the calling loop. - bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1, + bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { // First check if the result is already in the cache. @@ -592,9 +524,9 @@ private: if (result.hasValue()) { return result.getValue(); } - AliasAnalysis::Location Loc2 = getLocation(Inst2, AA); + MemoryLocation Loc2 = getLocation(Inst2, AA); bool aliased = true; - if (Loc1.Ptr && Loc2.Ptr) { + if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { // Do the alias check. aliased = AA->alias(Loc1, Loc2); } @@ -945,7 +877,6 @@ private: // Analysis and block reference. Function *F; ScalarEvolution *SE; - const DataLayout *DL; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -1198,8 +1129,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { + const DataLayout &DL = F->getParent()->getDataLayout(); + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1251,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::ICmp: case Instruction::FCmp: { // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast<CmpInst>(VL[i]); @@ -1368,9 +1300,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { return; } case Instruction::Store: { + const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1451,6 +1384,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + + // Reorder operands if reordering would enable vectorization. + if (isa<BinaryOperator>(VL0)) { + ValueList Left, Right; + reorderAltShuffleOperands(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1694,8 +1637,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { if (VectorizableTree.size() != 2) return false; - // Handle splat stores. - if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) + // Handle splat and all-constants stores. + if (!VectorizableTree[0].NeedToGather && + (allConstant(VectorizableTree[1].Scalars) || + isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. @@ -1774,7 +1719,7 @@ int BoUpSLP::getTreeCost() { // We only vectorize tiny trees if it is fully vectorizable. if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { - if (!VectorizableTree.size()) { + if (VectorizableTree.empty()) { assert(!ExternalUses.size() && "We should not have any external users"); } return INT_MAX; @@ -1847,7 +1792,7 @@ unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { return -1; } -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); @@ -1861,13 +1806,13 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) return false; - unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); APInt OffsetDelta = OffsetB - OffsetA; @@ -1888,6 +1833,198 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { return X == PtrSCEVB; } +// Reorder commutative operations in alternate shuffle if the resulting vectors +// are consecutive loads. This would allow us to vectorize the tree. +// If we have something like- +// load a[0] - load b[0] +// load b[1] + load a[1] +// load a[2] - load b[2] +// load a[3] + load b[3] +// Reordering the second load b[1] load a[1] would allow us to vectorize this +// code. +void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + const DataLayout &DL = F->getParent()->getDataLayout(); + + // Push left and right operands of binary operation into Left and Right + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Left.push_back(cast<Instruction>(VL[i])->getOperand(0)); + Right.push_back(cast<Instruction>(VL[i])->getOperand(1)); + } + + // Reorder if we have a commutative operation and consecutive access + // are on either side of the alternate instructions. + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + Instruction *VL1 = cast<Instruction>(VL[j]); + Instruction *VL2 = cast<Instruction>(VL[j + 1]); + if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + } +} + +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right) { + + SmallVector<Value *, 16> OrigLeft, OrigRight; + + bool AllSameOpcodeLeft = true; + bool AllSameOpcodeRight = true; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); + + OrigLeft.push_back(VLeft); + OrigRight.push_back(VRight); + + Instruction *ILeft = dyn_cast<Instruction>(VLeft); + Instruction *IRight = dyn_cast<Instruction>(VRight); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + if (i && AllSameOpcodeLeft && ILeft) { + if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { + if (PLeft->getOpcode() != ILeft->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (i && AllSameOpcodeRight && IRight) { + if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { + if (PRight->getOpcode() != IRight->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + // Sort two opcodes. In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. + // vl1 = load + // vl2 = phi + // vr1 = load + // vr2 = vr2 + // = vl1 x vr1 + // = vl2 x vr2 + // If we just sorted according to opcode we would leave the first line in + // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). + // = vl1 x vr1 + // = vr2 x vl2 + // Because vr2 and vr1 are from the same load we loose the opportunity of a + // broadcast for the packed right side in the backend: we have [vr1, vl2] + // instead of [vr1, vr2=vr1]. + if (ILeft && IRight) { + if (!i && ILeft->getOpcode() > IRight->getOpcode()) { + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() > IRight->getOpcode() && + Right[i - 1] != IRight) { + // Try not to destroy a broad cast for no apparent benefit. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Right[i - 1] == ILeft) { + // Try preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else if (i && ILeft->getOpcode() == IRight->getOpcode() && + Left[i - 1] == IRight) { + // Try preserve broadcasts. + Left.push_back(IRight); + Right.push_back(ILeft); + } else { + Left.push_back(ILeft); + Right.push_back(IRight); + } + continue; + } + // One opcode, put the instruction on the right. + if (ILeft) { + Left.push_back(VRight); + Right.push_back(ILeft); + continue; + } + Left.push_back(VLeft); + Right.push_back(VRight); + } + + bool LeftBroadcast = isSplat(Left); + bool RightBroadcast = isSplat(Right); + + // If operands end up being broadcast return this operand order. + if (LeftBroadcast || RightBroadcast) + return; + + // Don't reorder if the operands where good to begin. + if (AllSameOpcodeRight || AllSameOpcodeLeft) { + Left = OrigLeft; + Right = OrigRight; + } + + const DataLayout &DL = F->getParent()->getDataLayout(); + + // Finally check if we can get longer vectorizable chain by reordering + // without breaking the good operand order detected above. + // E.g. If we have something like- + // load a[0] load b[0] + // load b[1] load a[1] + // load a[2] load b[2] + // load a[3] load b[3] + // Reordering the second load b[1] load a[1] would allow us to vectorize + // this code and we still retain AllSameOpcode property. + // FIXME: This load reordering might break AllSameOpcode in some rare cases + // such as- + // add a[0],c[0] load b[0] + // add a[1],c[2] load b[1] + // b[2] load b[2] + // add a[3],c[3] load b[3] + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { + if (isConsecutiveAccess(L, L1, DL)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { + if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { + if (isConsecutiveAccess(L, L1, DL)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + // else unchanged + } +} + void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { Instruction *VL0 = cast<Instruction>(VL[0]); BasicBlock::iterator NextInst = VL0; @@ -1974,6 +2111,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Gather(E->Scalars, VecTy); } + const DataLayout &DL = F->getParent()->getDataLayout(); unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { @@ -2066,7 +2204,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Value *V = alreadyVectorized(E->Scalars)) return V; - CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; if (Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); @@ -2170,8 +2308,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); - if (!Alignment) - Alignment = DL->getABITypeAlignment(ScalarLoadTy); + if (!Alignment) { + Alignment = DL.getABITypeAlignment(ScalarLoadTy); + } LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; @@ -2200,8 +2339,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back( ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); + if (!Alignment) { + Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType()); + } S->setAlignment(Alignment); E->VectorizedValue = S; ++NumVectorInstructions; @@ -2227,7 +2367,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVecs.push_back(OpVec); } - Value *V = Builder.CreateGEP(Op0, OpVecs); + Value *V = Builder.CreateGEP( + cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); E->VectorizedValue = V; ++NumVectorInstructions; @@ -2243,7 +2384,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; if (CI && (FI = CI->getCalledFunction())) { - IID = (Intrinsic::ID) FI->getIntrinsicID(); + IID = FI->getIntrinsicID(); } std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { @@ -2284,10 +2425,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); - } + assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); @@ -2766,25 +2905,59 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, ScheduleData *DepDest = BundleMember->NextLoadStore; if (DepDest) { Instruction *SrcInst = BundleMember->Inst; - AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA); + MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); - if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { - if (SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)) { - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). + // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } } } @@ -2888,7 +3061,6 @@ struct SLPVectorizer : public FunctionPass { } ScalarEvolution *SE; - const DataLayout *DL; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -2901,12 +3073,11 @@ struct SLPVectorizer : public FunctionPass { return false; SE = &getAnalysis<ScalarEvolution>(); - DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = &getAnalysis<TargetTransformInfo>(); - TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; AA = &getAnalysis<AliasAnalysis>(); - LI = &getAnalysis<LoopInfo>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -2918,11 +3089,6 @@ struct SLPVectorizer : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - // Must have DataLayout. We can't require it because some tests run w/o - // triple. - if (!DL) - return false; - // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; @@ -2931,15 +3097,13 @@ struct SLPVectorizer : public FunctionPass { // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. // Scan the blocks in the function in post order. - for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), - e = po_end(&F.getEntryBlock()); it != e; ++it) { - BasicBlock *BB = *it; + for (auto BB : post_order(&F.getEntryBlock())) { // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; @@ -2964,10 +3128,10 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolution>(); AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetTransformInfo>(); - AU.addRequired<LoopInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -3014,15 +3178,11 @@ private: /// the WeakVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakVH array. -static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, - SmallVectorImpl<WeakVH> &VH, - unsigned SliceBegin, - unsigned SliceSize) { - for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) - if (VH[i] != VL[i]) - return true; - - return false; +static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH, + unsigned SliceBegin, unsigned SliceSize) { + VL = VL.slice(SliceBegin, SliceSize); + VH = VH.slice(SliceBegin, SliceSize); + return !std::equal(VL.begin(), VL.end(), VH.begin()); } bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, @@ -3031,7 +3191,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); + auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout(); + unsigned Sz = DL.getTypeSizeInBits(StoreTy); unsigned VF = MinVecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) @@ -3074,8 +3235,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, BoUpSLP &R) { - SetVector<Value *> Heads, Tails; - SmallDenseMap<Value *, Value *> ConsecutiveChain; + SetVector<StoreInst *> Heads, Tails; + SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. @@ -3088,8 +3249,8 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, for (unsigned j = 0; j < e; ++j) { if (i == j) continue; - - if (R.isConsecutiveAccess(Stores[i], Stores[j])) { + const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); + if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { Tails.insert(Stores[j]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[j]; @@ -3098,7 +3259,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, } // For stores that start but don't end a link in the chain: - for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); + for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) { if (Tails.count(*it)) continue; @@ -3106,7 +3267,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - Value *I = *it; + StoreInst *I = *it; // Collect the chain into a list. while (Tails.count(I) || Heads.count(I)) { if (VectorizedStores.count(I)) @@ -3131,6 +3292,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); + const DataLayout &DL = BB->getModule()->getDataLayout(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { StoreInst *SI = dyn_cast<StoreInst>(it); if (!SI) @@ -3176,9 +3338,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, return false; unsigned Opcode0 = I0->getOpcode(); + const DataLayout &DL = I0->getModule()->getDataLayout(); Type *Ty0 = I0->getType(); - unsigned Sz = DL->getTypeSizeInBits(Ty0); + unsigned Sz = DL.getTypeSizeInBits(Ty0); unsigned VF = MinVecRegSize / Sz; for (int i = 0, e = VL.size(); i < e; ++i) { @@ -3380,8 +3543,7 @@ public: ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} /// \brief Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, - const DataLayout *DL) { + bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "Thi phi needs to use the binary operator"); @@ -3406,9 +3568,10 @@ public: if (!isValidElementType(Ty)) return false; + const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); + ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -3718,8 +3881,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to match and vectorize a horizontal reduction. HorizontalReduction HorRdx; - if (ShouldVectorizeHor && - HorRdx.matchAssociativeReduction(P, BI, DL) && + if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && HorRdx.tryToReduce(R, TTI)) { Changed = true; it = BB->begin(); @@ -3749,7 +3911,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && + if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && HorRdx.tryToReduce(R, TTI)) || tryToVectorize(BinOp, R))) { Changed = true; @@ -3793,6 +3955,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // and the iterator may become invalid value. it = BB->begin(); e = BB->end(); + break; } } } @@ -3849,7 +4012,7 @@ char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp index d459bcf..6e002fd 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp @@ -19,7 +19,7 @@ #include "llvm/Analysis/Passes.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" using namespace llvm; |