Diffstat (limited to 'contrib/llvm/lib/Transforms/Vectorize')
4 files changed, 2549 insertions, 1441 deletions
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index af594cb..c01740b 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -3148,7 +3148,7 @@ namespace { LLVMContext::MD_noalias, LLVMContext::MD_fpmath, LLVMContext::MD_invariant_group}; combineMetadata(K, H, KnownIDs); - K->intersectOptionalDataWith(H); + K->andIRFlags(H); for (unsigned o = 0; o < NumOperands; ++o) K->setOperand(o, ReplacedOperands[o]); diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index c8906bd..c44a393 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -30,6 +31,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" using namespace llvm; @@ -40,13 +42,12 @@ STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); namespace { -// TODO: Remove this -static const unsigned TargetBaseAlign = 4; +// FIXME: Assuming stack alignment of 4 is always good enough +static const unsigned StackAdjustedAlignment = 4; +typedef SmallVector<Instruction *, 8> InstrList; +typedef MapVector<Value *, InstrList> InstrListMap; class Vectorizer { - typedef SmallVector<Value *, 8> ValueList; - typedef MapVector<Value *, ValueList> ValueListMap; - Function &F; AliasAnalysis &AA; DominatorTree &DT; @@ -54,8 +55,6 @@ class Vectorizer { TargetTransformInfo &TTI; const DataLayout &DL; IRBuilder<> Builder; - ValueListMap StoreRefs; - ValueListMap LoadRefs; public: Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, @@ -94,45 +93,47 @@ private: /// Returns the first and the last instructions in Chain. std::pair<BasicBlock::iterator, BasicBlock::iterator> - getBoundaryInstrs(ArrayRef<Value *> Chain); + getBoundaryInstrs(ArrayRef<Instruction *> Chain); /// Erases the original instructions after vectorizing. - void eraseInstructions(ArrayRef<Value *> Chain); + void eraseInstructions(ArrayRef<Instruction *> Chain); /// "Legalize" the vector type that would be produced by combining \p /// ElementSizeBits elements in \p Chain. Break into two pieces such that the /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is /// expected to have more than 4 elements. - std::pair<ArrayRef<Value *>, ArrayRef<Value *>> - splitOddVectorElts(ArrayRef<Value *> Chain, unsigned ElementSizeBits); + std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> + splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); - /// Checks for instructions which may affect the memory accessed - /// in the chain between \p From and \p To. Returns Index, where - /// \p Chain[0, Index) is the largest vectorizable chain prefix. - /// The elements of \p Chain should be all loads or all stores. 
- unsigned getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain, - BasicBlock::iterator From, - BasicBlock::iterator To); + /// Finds the largest prefix of Chain that's vectorizable, checking for + /// intervening instructions which may affect the memory accessed by the + /// instructions within Chain. + /// + /// The elements of \p Chain must be all loads or all stores and must be in + /// address order. + ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); /// Collects load and store instructions to vectorize. - void collectInstructions(BasicBlock *BB); + std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); - /// Processes the collected instructions, the \p Map. The elements of \p Map + /// Processes the collected instructions, the \p Map. The values of \p Map /// should be all loads or all stores. - bool vectorizeChains(ValueListMap &Map); + bool vectorizeChains(InstrListMap &Map); /// Finds the load/stores to consecutive memory addresses and vectorizes them. - bool vectorizeInstructions(ArrayRef<Value *> Instrs); + bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); /// Vectorizes the load instructions in Chain. - bool vectorizeLoadChain(ArrayRef<Value *> Chain, - SmallPtrSet<Value *, 16> *InstructionsProcessed); + bool + vectorizeLoadChain(ArrayRef<Instruction *> Chain, + SmallPtrSet<Instruction *, 16> *InstructionsProcessed); /// Vectorizes the store instructions in Chain. - bool vectorizeStoreChain(ArrayRef<Value *> Chain, - SmallPtrSet<Value *, 16> *InstructionsProcessed); + bool + vectorizeStoreChain(ArrayRef<Instruction *> Chain, + SmallPtrSet<Instruction *, 16> *InstructionsProcessed); - /// Check if this load/store access is misaligned accesses + /// Check if this load/store access is misaligned accesses. bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, unsigned Alignment); }; @@ -147,7 +148,7 @@ public: bool runOnFunction(Function &F) override; - const char *getPassName() const override { + StringRef getPassName() const override { return "GPU Load and Store Vectorizer"; } @@ -177,6 +178,13 @@ Pass *llvm::createLoadStoreVectorizerPass() { return new LoadStoreVectorizer(); } +// The real propagateMetadata expects a SmallVector<Value*>, but we deal in +// vectors of Instructions. +static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { + SmallVector<Value *, 8> VL(IL.begin(), IL.end()); + propagateMetadata(I, VL); +} + bool LoadStoreVectorizer::runOnFunction(Function &F) { // Don't vectorize when the attribute NoImplicitFloat is used. if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) @@ -198,7 +206,8 @@ bool Vectorizer::run() { // Scan the blocks in the function in post order. for (BasicBlock *BB : post_order(&F)) { - collectInstructions(BB); + InstrListMap LoadRefs, StoreRefs; + std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); Changed |= vectorizeChains(LoadRefs); Changed |= vectorizeChains(StoreRefs); } @@ -338,6 +347,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { } void Vectorizer::reorder(Instruction *I) { + OrderedBasicBlock OBB(I->getParent()); SmallPtrSet<Instruction *, 16> InstructionsToMove; SmallVector<Instruction *, 16> Worklist; @@ -350,11 +360,14 @@ void Vectorizer::reorder(Instruction *I) { if (!IM || IM->getOpcode() == Instruction::PHI) continue; - if (!DT.dominates(IM, I)) { + // If IM is in another BB, no need to move it, because this pass only + // vectorizes instructions within one BB. 
+ if (IM->getParent() != I->getParent()) + continue; + + if (!OBB.dominates(IM, I)) { InstructionsToMove.insert(IM); Worklist.push_back(IM); - assert(IM->getParent() == IW->getParent() && - "Instructions to move should be in the same basic block"); } } } @@ -362,7 +375,7 @@ void Vectorizer::reorder(Instruction *I) { // All instructions to move should follow I. Start from I, not from begin(). for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; ++BBI) { - if (!is_contained(InstructionsToMove, &*BBI)) + if (!InstructionsToMove.count(&*BBI)) continue; Instruction *IM = &*BBI; --BBI; @@ -372,8 +385,8 @@ void Vectorizer::reorder(Instruction *I) { } std::pair<BasicBlock::iterator, BasicBlock::iterator> -Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) { - Instruction *C0 = cast<Instruction>(Chain[0]); +Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { + Instruction *C0 = Chain[0]; BasicBlock::iterator FirstInstr = C0->getIterator(); BasicBlock::iterator LastInstr = C0->getIterator(); @@ -397,105 +410,152 @@ Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) { return std::make_pair(FirstInstr, ++LastInstr); } -void Vectorizer::eraseInstructions(ArrayRef<Value *> Chain) { +void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { SmallVector<Instruction *, 16> Instrs; - for (Value *V : Chain) { - Value *PtrOperand = getPointerOperand(V); + for (Instruction *I : Chain) { + Value *PtrOperand = getPointerOperand(I); assert(PtrOperand && "Instruction must have a pointer operand."); - Instrs.push_back(cast<Instruction>(V)); + Instrs.push_back(I); if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) Instrs.push_back(GEP); } // Erase instructions. - for (Value *V : Instrs) { - Instruction *Instr = cast<Instruction>(V); - if (Instr->use_empty()) - Instr->eraseFromParent(); - } + for (Instruction *I : Instrs) + if (I->use_empty()) + I->eraseFromParent(); } -std::pair<ArrayRef<Value *>, ArrayRef<Value *>> -Vectorizer::splitOddVectorElts(ArrayRef<Value *> Chain, +std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> +Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits) { - unsigned ElemSizeInBytes = ElementSizeBits / 8; - unsigned SizeInBytes = ElemSizeInBytes * Chain.size(); - unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes; - unsigned NumLeft = Chain.size() - NumRight; + unsigned ElementSizeBytes = ElementSizeBits / 8; + unsigned SizeBytes = ElementSizeBytes * Chain.size(); + unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; + if (NumLeft == Chain.size()) + --NumLeft; + else if (NumLeft == 0) + NumLeft = 1; return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); } -unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef<Value *> Chain, - BasicBlock::iterator From, - BasicBlock::iterator To) { - SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs; - SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs; +ArrayRef<Instruction *> +Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { + // These are in BB order, unlike Chain, which is in address order. 
+ SmallVector<Instruction *, 16> MemoryInstrs; + SmallVector<Instruction *, 16> ChainInstrs; + + bool IsLoadChain = isa<LoadInst>(Chain[0]); + DEBUG({ + for (Instruction *I : Chain) { + if (IsLoadChain) + assert(isa<LoadInst>(I) && + "All elements of Chain must be loads, or all must be stores."); + else + assert(isa<StoreInst>(I) && + "All elements of Chain must be loads, or all must be stores."); + } + }); - unsigned InstrIdx = 0; - for (auto I = From; I != To; ++I, ++InstrIdx) { + for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { if (isa<LoadInst>(I) || isa<StoreInst>(I)) { - if (!is_contained(Chain, &*I)) - MemoryInstrs.push_back({&*I, InstrIdx}); + if (!is_contained(Chain, &I)) + MemoryInstrs.push_back(&I); else - ChainInstrs.push_back({&*I, InstrIdx}); - } else if (I->mayHaveSideEffects()) { - DEBUG(dbgs() << "LSV: Found side-effecting operation: " << *I << '\n'); - return 0; + ChainInstrs.push_back(&I); + } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { + DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'); + break; + } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { + DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I + << '\n'); + break; } } - assert(Chain.size() == ChainInstrs.size() && - "All instructions in the Chain must exist in [From, To)."); + OrderedBasicBlock OBB(Chain[0]->getParent()); - unsigned ChainIdx = 0; - for (auto EntryChain : ChainInstrs) { - Value *ChainInstrValue = EntryChain.first; - unsigned ChainInstrIdx = EntryChain.second; - for (auto EntryMem : MemoryInstrs) { - Value *MemInstrValue = EntryMem.first; - unsigned MemInstrIdx = EntryMem.second; - if (isa<LoadInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue)) + // Loop until we find an instruction in ChainInstrs that we can't vectorize. + unsigned ChainInstrIdx = 0; + Instruction *BarrierMemoryInstr = nullptr; + + for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { + Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; + + // If a barrier memory instruction was found, chain instructions that follow + // will not be added to the valid prefix. + if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr)) + break; + + // Check (in BB order) if any instruction prevents ChainInstr from being + // vectorized. Find and store the first such "conflicting" instruction. + for (Instruction *MemInstr : MemoryInstrs) { + // If a barrier memory instruction was found, do not check past it. + if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) + break; + + if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr)) continue; // We can ignore the alias as long as the load comes before the store, // because that means we won't be moving the load past the store to // vectorize it (the vectorized load is inserted at the location of the // first load in the chain). - if (isa<StoreInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue) && - ChainInstrIdx < MemInstrIdx) + if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) && + OBB.dominates(ChainInstr, MemInstr)) continue; // Same case, but in reverse. 
- if (isa<LoadInst>(MemInstrValue) && isa<StoreInst>(ChainInstrValue) && - ChainInstrIdx > MemInstrIdx) + if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) && + OBB.dominates(MemInstr, ChainInstr)) continue; - Instruction *M0 = cast<Instruction>(MemInstrValue); - Instruction *M1 = cast<Instruction>(ChainInstrValue); - - if (!AA.isNoAlias(MemoryLocation::get(M0), MemoryLocation::get(M1))) { + if (!AA.isNoAlias(MemoryLocation::get(MemInstr), + MemoryLocation::get(ChainInstr))) { DEBUG({ - Value *Ptr0 = getPointerOperand(M0); - Value *Ptr1 = getPointerOperand(M1); - - dbgs() << "LSV: Found alias.\n" - " Aliasing instruction and pointer:\n" - << *MemInstrValue << " aliases " << *Ptr0 << '\n' - << " Aliased instruction and pointer:\n" - << *ChainInstrValue << " aliases " << *Ptr1 << '\n'; + dbgs() << "LSV: Found alias:\n" + " Aliasing instruction and pointer:\n" + << " " << *MemInstr << '\n' + << " " << *getPointerOperand(MemInstr) << '\n' + << " Aliased instruction and pointer:\n" + << " " << *ChainInstr << '\n' + << " " << *getPointerOperand(ChainInstr) << '\n'; }); - - return ChainIdx; + // Save this aliasing memory instruction as a barrier, but allow other + // instructions that precede the barrier to be vectorized with this one. + BarrierMemoryInstr = MemInstr; + break; } } - ChainIdx++; + // Continue the search only for store chains, since vectorizing stores that + // precede an aliasing load is valid. Conversely, vectorizing loads is valid + // up to an aliasing store, but should not pull loads from further down in + // the basic block. + if (IsLoadChain && BarrierMemoryInstr) { + // The BarrierMemoryInstr is a store that precedes ChainInstr. + assert(OBB.dominates(BarrierMemoryInstr, ChainInstr)); + break; + } } - return Chain.size(); + + // Find the largest prefix of Chain whose elements are all in + // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of + // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB + // order.) + SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( + ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); + unsigned ChainIdx = 0; + for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { + if (!VectorizableChainInstrs.count(Chain[ChainIdx])) + break; + } + return Chain.slice(0, ChainIdx); } -void Vectorizer::collectInstructions(BasicBlock *BB) { - LoadRefs.clear(); - StoreRefs.clear(); +std::pair<InstrListMap, InstrListMap> +Vectorizer::collectInstructions(BasicBlock *BB) { + InstrListMap LoadRefs; + InstrListMap StoreRefs; for (Instruction &I : *BB) { if (!I.mayReadOrWriteMemory()) @@ -505,6 +565,10 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { if (!LI->isSimple()) continue; + // Skip if it's not legal. + if (!TTI.isLegalToVectorizeLoad(LI)) + continue; + Type *Ty = LI->getType(); if (!VectorType::isValidElementType(Ty->getScalarType())) continue; @@ -525,14 +589,11 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { // Make sure all the users of a vector are constant-index extracts. if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) { - const Instruction *UI = cast<Instruction>(U); - return isa<ExtractElementInst>(UI) && - isa<ConstantInt>(UI->getOperand(1)); + const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); + return EEI && isa<ConstantInt>(EEI->getOperand(1)); })) continue; - // TODO: Target hook to filter types. - // Save the load locations. 
Value *ObjPtr = GetUnderlyingObject(Ptr, DL); LoadRefs[ObjPtr].push_back(LI); @@ -541,6 +602,10 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { if (!SI->isSimple()) continue; + // Skip if it's not legal. + if (!TTI.isLegalToVectorizeStore(SI)) + continue; + Type *Ty = SI->getValueOperand()->getType(); if (!VectorType::isValidElementType(Ty->getScalarType())) continue; @@ -558,9 +623,8 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { continue; if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) { - const Instruction *UI = cast<Instruction>(U); - return isa<ExtractElementInst>(UI) && - isa<ConstantInt>(UI->getOperand(1)); + const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); + return EEI && isa<ConstantInt>(EEI->getOperand(1)); })) continue; @@ -569,12 +633,14 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { StoreRefs[ObjPtr].push_back(SI); } } + + return {LoadRefs, StoreRefs}; } -bool Vectorizer::vectorizeChains(ValueListMap &Map) { +bool Vectorizer::vectorizeChains(InstrListMap &Map) { bool Changed = false; - for (const std::pair<Value *, ValueList> &Chain : Map) { + for (const std::pair<Value *, InstrList> &Chain : Map) { unsigned Size = Chain.second.size(); if (Size < 2) continue; @@ -584,7 +650,7 @@ bool Vectorizer::vectorizeChains(ValueListMap &Map) { // Process the stores in chunks of 64. for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { unsigned Len = std::min<unsigned>(CE - CI, 64); - ArrayRef<Value *> Chunk(&Chain.second[CI], Len); + ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); Changed |= vectorizeInstructions(Chunk); } } @@ -592,9 +658,9 @@ bool Vectorizer::vectorizeChains(ValueListMap &Map) { return Changed; } -bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) { +bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"); - SmallSetVector<int, 16> Heads, Tails; + SmallVector<int, 16> Heads, Tails; int ConsecutiveChain[64]; // Do a quadratic search on all of the given stores and find all of the pairs @@ -613,34 +679,34 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) { continue; // Should not insert. } - Tails.insert(j); - Heads.insert(i); + Tails.push_back(j); + Heads.push_back(i); ConsecutiveChain[i] = j; } } } bool Changed = false; - SmallPtrSet<Value *, 16> InstructionsProcessed; + SmallPtrSet<Instruction *, 16> InstructionsProcessed; for (int Head : Heads) { if (InstructionsProcessed.count(Instrs[Head])) continue; - bool longerChainExists = false; + bool LongerChainExists = false; for (unsigned TIt = 0; TIt < Tails.size(); TIt++) if (Head == Tails[TIt] && !InstructionsProcessed.count(Instrs[Heads[TIt]])) { - longerChainExists = true; + LongerChainExists = true; break; } - if (longerChainExists) + if (LongerChainExists) continue; // We found an instr that starts a chain. Now follow the chain and try to // vectorize it. 
- SmallVector<Value *, 16> Operands; + SmallVector<Instruction *, 16> Operands; int I = Head; - while (I != -1 && (Tails.count(I) || Heads.count(I))) { + while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { if (InstructionsProcessed.count(Instrs[I])) break; @@ -661,13 +727,14 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) { } bool Vectorizer::vectorizeStoreChain( - ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) { + ArrayRef<Instruction *> Chain, + SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { StoreInst *S0 = cast<StoreInst>(Chain[0]); // If the vector has an int element, default to int for the whole load. Type *StoreTy; - for (const auto &V : Chain) { - StoreTy = cast<StoreInst>(V)->getValueOperand()->getType(); + for (Instruction *I : Chain) { + StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); if (StoreTy->isIntOrIntVectorTy()) break; @@ -683,40 +750,34 @@ bool Vectorizer::vectorizeStoreChain( unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; unsigned ChainSize = Chain.size(); + unsigned Alignment = getAlignment(S0); if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last); - if (StopChain == 0) { - // There exists a side effect instruction, no vectorization possible. + ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); + if (NewChain.empty()) { + // No vectorization possible. InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - if (StopChain == 1) { + if (NewChain.size() == 1) { // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(Chain.front()); + InstructionsProcessed->insert(NewChain.front()); return false; } // Update Chain to the valid vectorizable subchain. - Chain = Chain.slice(0, StopChain); + Chain = NewChain; ChainSize = Chain.size(); - // Store size should be 1B, 2B or multiple of 4B. - // TODO: Target hook for size constraint? - unsigned SzInBytes = (Sz / 8) * ChainSize; - if (SzInBytes > 2 && SzInBytes % 4 != 0) { - DEBUG(dbgs() << "LSV: Size should be 1B, 2B " - "or multiple of 4B. Splitting.\n"); - if (SzInBytes == 3) - return vectorizeStoreChain(Chain.slice(0, ChainSize - 1), - InstructionsProcessed); - + // Check if it's legal to vectorize this chain. If not, split the chain and + // try again. + unsigned EltSzInBytes = Sz / 8; + unsigned SzInBytes = EltSzInBytes * ChainSize; + if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { auto Chains = splitOddVectorElts(Chain, Sz); return vectorizeStoreChain(Chains.first, InstructionsProcessed) | vectorizeStoreChain(Chains.second, InstructionsProcessed); @@ -730,45 +791,41 @@ bool Vectorizer::vectorizeStoreChain( else VecTy = VectorType::get(StoreTy, Chain.size()); - // If it's more than the max vector size, break it into two pieces. - // TODO: Target hook to control types to split to. - if (ChainSize > VF) { - DEBUG(dbgs() << "LSV: Vector factor is too big." + // If it's more than the max vector size or the target has a better + // vector factor, break it into two pieces. 
+ unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); + if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { + DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"); - return vectorizeStoreChain(Chain.slice(0, VF), InstructionsProcessed) | - vectorizeStoreChain(Chain.slice(VF), InstructionsProcessed); + return vectorizeStoreChain(Chain.slice(0, TargetVF), + InstructionsProcessed) | + vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); } DEBUG({ dbgs() << "LSV: Stores to vectorize:\n"; - for (Value *V : Chain) - V->dump(); + for (Instruction *I : Chain) + dbgs() << " " << *I << "\n"; }); // We won't try again to vectorize the elements of the chain, regardless of // whether we succeed below. InstructionsProcessed->insert(Chain.begin(), Chain.end()); - // Check alignment restrictions. - unsigned Alignment = getAlignment(S0); - // If the store is going to be misaligned, don't vectorize it. if (accessIsMisaligned(SzInBytes, AS, Alignment)) { if (S0->getPointerAddressSpace() != 0) return false; - // If we're storing to an object on the stack, we control its alignment, - // so we can cheat and change it! - Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL); - if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) { - AI->setAlignment(TargetBaseAlign); - Alignment = TargetBaseAlign; - } else { + unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), + StackAdjustedAlignment, + DL, S0, nullptr, &DT); + if (NewAlign < StackAdjustedAlignment) return false; - } } - // Set insert point. + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(Chain); Builder.SetInsertPoint(&*Last); Value *Vec = UndefValue::get(VecTy); @@ -803,9 +860,11 @@ bool Vectorizer::vectorizeStoreChain( } } - Value *Bitcast = - Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)); - StoreInst *SI = cast<StoreInst>(Builder.CreateStore(Vec, Bitcast)); + // This cast is safe because Builder.CreateStore() always creates a bona fide + // StoreInst. + StoreInst *SI = cast<StoreInst>( + Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(), + VecTy->getPointerTo(AS)))); propagateMetadata(SI, Chain); SI->setAlignment(Alignment); @@ -816,7 +875,8 @@ bool Vectorizer::vectorizeStoreChain( } bool Vectorizer::vectorizeLoadChain( - ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) { + ArrayRef<Instruction *> Chain, + SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { LoadInst *L0 = cast<LoadInst>(Chain[0]); // If the vector has an int element, default to int for the whole load. @@ -838,39 +898,34 @@ bool Vectorizer::vectorizeLoadChain( unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; unsigned ChainSize = Chain.size(); + unsigned Alignment = getAlignment(L0); if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last); - if (StopChain == 0) { - // There exists a side effect instruction, no vectorization possible. + ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); + if (NewChain.empty()) { + // No vectorization possible. 
InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - if (StopChain == 1) { + if (NewChain.size() == 1) { // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(Chain.front()); + InstructionsProcessed->insert(NewChain.front()); return false; } // Update Chain to the valid vectorizable subchain. - Chain = Chain.slice(0, StopChain); + Chain = NewChain; ChainSize = Chain.size(); - // Load size should be 1B, 2B or multiple of 4B. - // TODO: Should size constraint be a target hook? - unsigned SzInBytes = (Sz / 8) * ChainSize; - if (SzInBytes > 2 && SzInBytes % 4 != 0) { - DEBUG(dbgs() << "LSV: Size should be 1B, 2B " - "or multiple of 4B. Splitting.\n"); - if (SzInBytes == 3) - return vectorizeLoadChain(Chain.slice(0, ChainSize - 1), - InstructionsProcessed); + // Check if it's legal to vectorize this chain. If not, split the chain and + // try again. + unsigned EltSzInBytes = Sz / 8; + unsigned SzInBytes = EltSzInBytes * ChainSize; + if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { auto Chains = splitOddVectorElts(Chain, Sz); return vectorizeLoadChain(Chains.first, InstructionsProcessed) | vectorizeLoadChain(Chains.second, InstructionsProcessed); @@ -884,101 +939,99 @@ bool Vectorizer::vectorizeLoadChain( else VecTy = VectorType::get(LoadTy, Chain.size()); - // If it's more than the max vector size, break it into two pieces. - // TODO: Target hook to control types to split to. - if (ChainSize > VF) { - DEBUG(dbgs() << "LSV: Vector factor is too big. " - "Creating two separate arrays.\n"); - return vectorizeLoadChain(Chain.slice(0, VF), InstructionsProcessed) | - vectorizeLoadChain(Chain.slice(VF), InstructionsProcessed); + // If it's more than the max vector size or the target has a better + // vector factor, break it into two pieces. + unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); + if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { + DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." + " Creating two separate arrays.\n"); + return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | + vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); } // We won't try again to vectorize the elements of the chain, regardless of // whether we succeed below. InstructionsProcessed->insert(Chain.begin(), Chain.end()); - // Check alignment restrictions. - unsigned Alignment = getAlignment(L0); - // If the load is going to be misaligned, don't vectorize it. if (accessIsMisaligned(SzInBytes, AS, Alignment)) { if (L0->getPointerAddressSpace() != 0) return false; - // If we're loading from an object on the stack, we control its alignment, - // so we can cheat and change it! - Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL); - if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) { - AI->setAlignment(TargetBaseAlign); - Alignment = TargetBaseAlign; - } else { + unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), + StackAdjustedAlignment, + DL, L0, nullptr, &DT); + if (NewAlign < StackAdjustedAlignment) return false; - } + + Alignment = NewAlign; } DEBUG({ dbgs() << "LSV: Loads to vectorize:\n"; - for (Value *V : Chain) - V->dump(); + for (Instruction *I : Chain) + I->dump(); }); - // Set insert point. + // getVectorizablePrefix already computed getBoundaryInstrs. The value of + // Last may have changed since then, but the value of First won't have. 
If it + // matters, we could compute getBoundaryInstrs only once and reuse it here. + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(Chain); Builder.SetInsertPoint(&*First); Value *Bitcast = Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); - + // This cast is safe because Builder.CreateLoad always creates a bona fide + // LoadInst. LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast)); propagateMetadata(LI, Chain); LI->setAlignment(Alignment); if (VecLoadTy) { SmallVector<Instruction *, 16> InstrsToErase; - SmallVector<Instruction *, 16> InstrsToReorder; - InstrsToReorder.push_back(cast<Instruction>(Bitcast)); unsigned VecWidth = VecLoadTy->getNumElements(); for (unsigned I = 0, E = Chain.size(); I != E; ++I) { for (auto Use : Chain[I]->users()) { + // All users of vector loads are ExtractElement instructions with + // constant indices, otherwise we would have bailed before now. Instruction *UI = cast<Instruction>(Use); unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); unsigned NewIdx = Idx + I * VecWidth; - Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx)); - Instruction *Extracted = cast<Instruction>(V); - if (Extracted->getType() != UI->getType()) - Extracted = cast<Instruction>( - Builder.CreateBitCast(Extracted, UI->getType())); + Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), + UI->getName()); + if (V->getType() != UI->getType()) + V = Builder.CreateBitCast(V, UI->getType()); // Replace the old instruction. - UI->replaceAllUsesWith(Extracted); + UI->replaceAllUsesWith(V); InstrsToErase.push_back(UI); } } - for (Instruction *ModUser : InstrsToReorder) - reorder(ModUser); + // Bitcast might not be an Instruction, if the value being loaded is a + // constant. In that case, no need to reorder anything. + if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) + reorder(BitcastInst); for (auto I : InstrsToErase) I->eraseFromParent(); } else { - SmallVector<Instruction *, 16> InstrsToReorder; - InstrsToReorder.push_back(cast<Instruction>(Bitcast)); - for (unsigned I = 0, E = Chain.size(); I != E; ++I) { - Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I)); - Instruction *Extracted = cast<Instruction>(V); - Instruction *UI = cast<Instruction>(Chain[I]); - if (Extracted->getType() != UI->getType()) { - Extracted = cast<Instruction>( - Builder.CreateBitOrPointerCast(Extracted, UI->getType())); + Value *CV = Chain[I]; + Value *V = + Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); + if (V->getType() != CV->getType()) { + V = Builder.CreateBitOrPointerCast(V, CV->getType()); } // Replace the old instruction. 
- UI->replaceAllUsesWith(Extracted); + CV->replaceAllUsesWith(V); } - for (Instruction *ModUser : InstrsToReorder) - reorder(ModUser); + if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) + reorder(BitcastInst); } eraseInstructions(Chain); @@ -990,10 +1043,14 @@ bool Vectorizer::vectorizeLoadChain( bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, unsigned Alignment) { + if (Alignment % SzInBytes == 0) + return false; + bool Fast = false; - bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace, + bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), + SzInBytes * 8, AddressSpace, Alignment, &Fast); - // TODO: Remove TargetBaseAlign - return !(Allows && Fast) && (Alignment % SzInBytes) != 0 && - (Alignment % TargetBaseAlign) != 0; + DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows + << " and fast? " << Fast << "\n";); + return !Allows || !Fast; } diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ee5733d..dac7032 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" @@ -191,7 +192,7 @@ static cl::opt<bool> EnableIndVarRegisterHeur( cl::desc("Count the induction variable only once when interleaving")); static cl::opt<bool> EnableCondStoresVectorization( - "enable-cond-stores-vec", cl::init(false), cl::Hidden, + "enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization.")); static cl::opt<unsigned> MaxNestedScalarReductionIC( @@ -213,6 +214,32 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis +createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, + Instruction *I = nullptr) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + namespace { // Forward declarations. @@ -221,70 +248,13 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; -// A traits type that is intended to be used in graph algorithms. The graph it -// models starts at the loop header, and traverses the BasicBlocks that are in -// the loop body, but not the loop header. Since the loop header is skipped, -// the back edges are excluded. 
-struct LoopBodyTraits { - using NodeRef = std::pair<const Loop *, BasicBlock *>; - - // This wraps a const Loop * into the iterator, so we know which edges to - // filter out. - class WrappedSuccIterator - : public iterator_adaptor_base< - WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { - using BaseT = iterator_adaptor_base< - WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; - - const Loop *L; - - public: - WrappedSuccIterator(succ_iterator Begin, const Loop *L) - : BaseT(Begin), L(L) {} - - NodeRef operator*() const { return {L, *I}; } - }; - - struct LoopBodyFilter { - bool operator()(NodeRef N) const { - const Loop *L = N.first; - return N.second != L->getHeader() && L->contains(N.second); - } - }; - - using ChildIteratorType = - filter_iterator<WrappedSuccIterator, LoopBodyFilter>; - - static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; } - - static ChildIteratorType child_begin(NodeRef Node) { - return make_filter_range(make_range<WrappedSuccIterator>( - {succ_begin(Node.second), Node.first}, - {succ_end(Node.second), Node.first}), - LoopBodyFilter{}) - .begin(); - } - - static ChildIteratorType child_end(NodeRef Node) { - return make_filter_range(make_range<WrappedSuccIterator>( - {succ_begin(Node.second), Node.first}, - {succ_end(Node.second), Node.first}), - LoopBodyFilter{}) - .end(); - } -}; - /// Returns true if the given loop body has a cycle, excluding the loop /// itself. static bool hasCyclesInLoopBody(const Loop &L) { if (!L.empty()) return true; - for (const auto SCC : + for (const auto &SCC : make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L), scc_iterator<Loop, LoopBodyTraits>::end(L))) { if (SCC.size() > 1) { @@ -346,6 +316,41 @@ static GetElementPtrInst *getGEPInstruction(Value *Ptr) { return nullptr; } +/// A helper function that returns the pointer operand of a load or store +/// instruction. +static Value *getPointerOperand(Value *I) { + if (auto *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperand(); + if (auto *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + return nullptr; +} + +/// A helper function that returns true if the given type is irregular. The +/// type is irregular if its allocated size doesn't equal the store size of an +/// element of the corresponding vector type at the given vectorization factor. +static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { + + // Determine if an array of VF elements of type Ty is "bitcast compatible" + // with a <VF x Ty> vector. + if (VF > 1) { + auto *VectorTy = VectorType::get(Ty, VF); + return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); + } + + // If the vectorization factor is one, we just check if an array of type Ty + // requires padding between elements. + return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); +} + +/// A helper function that returns the reciprocal of the block probability of +/// predicated blocks. If we return X, we are assuming the predicated block +/// will execute once for for every X iterations of the loop header. +/// +/// TODO: We should use actual block probability here, if available. Currently, +/// we always assume predicated blocks have a 50% chance of executing. 
+static unsigned getReciprocalPredBlockProb() { return 2; } + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -366,29 +371,21 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - unsigned VecWidth, unsigned UnrollFactor) + OptimizationRemarkEmitter *ORE, unsigned VecWidth, + unsigned UnrollFactor, LoopVectorizationLegality *LVL, + LoopVectorizationCostModel *CM) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - AC(AC), VF(VecWidth), UF(UnrollFactor), + AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), - OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr), - VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {} + OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), + TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). - // MinimumBitWidths maps scalar integer values to the smallest bitwidth they - // can be validly truncated to. The cost model has assumed this truncation - // will happen when vectorizing. VecValuesToIgnore contains scalar values - // that the cost model has chosen to ignore because they will not be - // vectorized. - void vectorize(LoopVectorizationLegality *L, - const MapVector<Instruction *, uint64_t> &MinimumBitWidths, - SmallPtrSetImpl<const Value *> &VecValuesToIgnore) { - MinBWs = &MinimumBitWidths; - ValuesNotWidened = &VecValuesToIgnore; - Legal = L; + void vectorize() { // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. - // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); } @@ -400,11 +397,18 @@ public: protected: /// A small list of PHINodes. typedef SmallVector<PHINode *, 4> PhiVector; - /// When we unroll loops we have multiple vector values for each scalar. - /// This data structure holds the unrolled and vectorized values that - /// originated from one scalar instruction. + + /// A type for vectorized values in the new loop. Each value from the + /// original loop, when vectorized, is represented by UF vector values in the + /// new unrolled loop, where UF is the unroll factor. typedef SmallVector<Value *, 2> VectorParts; + /// A type for scalarized values in the new loop. Each value from the + /// original loop, when scalarized, is represented by UF x VF scalar values + /// in the new unrolled loop, where UF is the unroll factor and VF is the + /// vectorization factor. + typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts; + // When we if-convert we need to create edge masks. We have to cache values // so that we don't end up with exponential recursion/IR. typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts> @@ -434,7 +438,20 @@ protected: /// See PR14725. void fixLCSSAPHIs(); - /// Shrinks vector element sizes based on information in "MinBWs". + /// Iteratively sink the scalarized operands of a predicated instruction into + /// the block that was created for it. + void sinkScalarOperands(Instruction *PredInst); + + /// Predicate conditional instructions that require predication on their + /// respective conditions. 
+ void predicateInstructions(); + + /// Collect the instructions from the original loop that would be trivially + /// dead in the vectorized loop if generated. + void collectTriviallyDeadInstructions(); + + /// Shrinks vector element sizes to the smallest bitwidth they can be legally + /// represented as. void truncateToMinimalBitwidths(); /// A helper function that computes the predicate of the block BB, assuming @@ -451,19 +468,19 @@ protected: /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, VectorParts &Entry, unsigned UF, - unsigned VF, PhiVector *PV); + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF, + PhiVector *PV); /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateStore is true we need to 'hide' each + /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each /// scalarized instruction behind an if block predicated on the control /// dependence of the instruction. virtual void scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore = false); + bool IfPredicateInstr = false); /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -477,7 +494,10 @@ protected: /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. - virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); + /// \p Opcode is relevant for FP induction variable. + virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps Opcode = + Instruction::BinaryOpsEnd); /// Compute scalar induction steps. \p ScalarIV is the scalar induction /// variable on which to base the steps, \p Step is the size of the step, and @@ -488,23 +508,39 @@ protected: /// Create a vector induction phi node based on an existing scalar one. This /// currently only works for integer induction variables with a constant - /// step. If \p TruncType is non-null, instead of widening the original IV, - /// we widen a version of the IV truncated to \p TruncType. + /// step. \p EntryVal is the value from the original loop that maps to the + /// vector phi node. If \p EntryVal is a truncate instruction, instead of + /// widening the original IV, we widen a version of the IV truncated to \p + /// EntryVal's type. void createVectorIntInductionPHI(const InductionDescriptor &II, - VectorParts &Entry, IntegerType *TruncType); + Instruction *EntryVal); /// Widen an integer induction variable \p IV. If \p Trunc is provided, the - /// induction variable will first be truncated to the corresponding type. The - /// widened values are placed in \p Entry. - void widenIntInduction(PHINode *IV, VectorParts &Entry, - TruncInst *Trunc = nullptr); - - /// When we go over instructions in the basic block we rely on previous - /// values within the current basic block or on loop invariant values. - /// When we widen (vectorize) values we place them in the map. If the values - /// are not within the map, they have to be loop invariant, so we simply - /// broadcast them into a vector. - VectorParts &getVectorValue(Value *V); + /// induction variable will first be truncated to the corresponding type. 
+ void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr); + + /// Returns true if an instruction \p I should be scalarized instead of + /// vectorized for the chosen vectorization factor. + bool shouldScalarizeInstruction(Instruction *I) const; + + /// Returns true if we should generate a scalar version of \p IV. + bool needsScalarInduction(Instruction *IV) const; + + /// Return a constant reference to the VectorParts corresponding to \p V from + /// the original loop. If the value has already been vectorized, the + /// corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// new vector values on-demand by inserting the scalar values into vectors + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into vectors. + const VectorParts &getVectorValue(Value *V); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part and vector index \p Lane. If the value has + /// been vectorized but not scalarized, the necessary extractelement + /// instruction will be generated. + Value *getScalarValue(Value *V, unsigned Part, unsigned Lane); /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -547,44 +583,112 @@ protected: /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); - /// This is a helper class that holds the vectorizer state. It maps scalar - /// instructions to vector instructions. When the code is 'unrolled' then - /// then a single scalar value is mapped to multiple vector parts. The parts - /// are stored in the VectorPart type. + /// This is a helper class for maintaining vectorization state. It's used for + /// mapping values from the original loop to their corresponding values in + /// the new loop. Two mappings are maintained: one for vectorized values and + /// one for scalarized values. Vectorized values are represented with UF + /// vector values in the new loop, and scalarized values are represented with + /// UF x VF scalar values in the new loop. UF and VF are the unroll and + /// vectorization factors, respectively. + /// + /// Entries can be added to either map with initVector and initScalar, which + /// initialize and return a constant reference to the new entry. If a + /// non-constant reference to a vector entry is required, getVector can be + /// used to retrieve a mutable entry. We currently directly modify the mapped + /// values during "fix-up" operations that occur once the first phase of + /// widening is complete. These operations include type truncation and the + /// second phase of recurrence widening. + /// + /// Otherwise, entries from either map should be accessed using the + /// getVectorValue or getScalarValue functions from InnerLoopVectorizer. + /// getVectorValue and getScalarValue coordinate to generate a vector or + /// scalar value on-demand if one is not yet available. When vectorizing a + /// loop, we visit the definition of an instruction before its uses. When + /// visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) 
Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. struct ValueMap { - /// C'tor. UnrollFactor controls the number of vectors ('parts') that - /// are mapped. - ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} - - /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) const { return MapStorage.count(Key); } - - /// Initializes a new entry in the map. Sets all of the vector parts to the - /// save value in 'Val'. - /// \return A reference to a vector with splat values. - VectorParts &splat(Value *Key, Value *Val) { - VectorParts &Entry = MapStorage[Key]; - Entry.assign(UF, Val); - return Entry; + + /// Construct an empty map with the given unroll and vectorization factors. + ValueMap(unsigned UnrollFactor, unsigned VecWidth) + : UF(UnrollFactor), VF(VecWidth) { + // The unroll and vectorization factors are only used in asserts builds + // to verify map entries are sized appropriately. + (void)UF; + (void)VF; } - ///\return A reference to the value that is stored at 'Key'. - VectorParts &get(Value *Key) { - VectorParts &Entry = MapStorage[Key]; - if (Entry.empty()) - Entry.resize(UF); - assert(Entry.size() == UF); - return Entry; + /// \return True if the map has a vector entry for \p Key. + bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); } + + /// \return True if the map has a scalar entry for \p Key. + bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); } + + /// \brief Map \p Key to the given VectorParts \p Entry, and return a + /// constant reference to the new vector map entry. The given key should + /// not already be in the map, and the given VectorParts should be + /// correctly sized for the current unroll factor. + const VectorParts &initVector(Value *Key, const VectorParts &Entry) { + assert(!hasVector(Key) && "Vector entry already initialized"); + assert(Entry.size() == UF && "VectorParts has wrong dimensions"); + VectorMapStorage[Key] = Entry; + return VectorMapStorage[Key]; } + /// \brief Map \p Key to the given ScalarParts \p Entry, and return a + /// constant reference to the new scalar map entry. The given key should + /// not already be in the map, and the given ScalarParts should be + /// correctly sized for the current unroll and vectorization factors. + const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) { + assert(!hasScalar(Key) && "Scalar entry already initialized"); + assert(Entry.size() == UF && + all_of(make_range(Entry.begin(), Entry.end()), + [&](const SmallVectorImpl<Value *> &Values) -> bool { + return Values.size() == VF; + }) && + "ScalarParts has wrong dimensions"); + ScalarMapStorage[Key] = Entry; + return ScalarMapStorage[Key]; + } + + /// \return A reference to the vector map entry corresponding to \p Key. + /// The key should already be in the map. This function should only be used + /// when it's necessary to update values that have already been vectorized. + /// This is the case for "fix-up" operations including type truncation and + /// the second phase of recurrence vectorization. If a non-const reference + /// isn't required, getVectorValue should be used instead. 
+ VectorParts &getVector(Value *Key) { + assert(hasVector(Key) && "Vector entry not initialized"); + return VectorMapStorage.find(Key)->second; + } + + /// Retrieve an entry from the vector or scalar maps. The preferred way to + /// access an existing mapped entry is with getVectorValue or + /// getScalarValue from InnerLoopVectorizer. Until those functions can be + /// moved inside ValueMap, we have to declare them as friends. + friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V); + friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane); + private: - /// The unroll factor. Each entry in the map stores this number of vector - /// elements. + /// The unroll factor. Each entry in the vector map contains UF vector + /// values. unsigned UF; - /// Map storage. We use std::map and not DenseMap because insertions to a - /// dense map invalidates its iterators. - std::map<Value *, VectorParts> MapStorage; + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + std::map<Value *, VectorParts> VectorMapStorage; + std::map<Value *, ScalarParts> ScalarMapStorage; }; /// The original loop. @@ -605,6 +709,8 @@ protected: const TargetTransformInfo *TTI; /// Assumption Cache. AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; /// \brief LoopVersioning. It's only set up (non-null) if memchecks were /// used. @@ -646,41 +752,42 @@ protected: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; - /// Maps scalars to widened vectors. - ValueMap WidenMap; - - /// A map of induction variables from the original loop to their - /// corresponding VF * UF scalarized values in the vectorized loop. The - /// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap - /// maps original loop values to their vector versions in the new loop, - /// ScalarIVMap maps induction variables from the original loop that are not - /// vectorized to their scalar equivalents in the vector loop. Maintaining a - /// separate map for scalarized induction variables allows us to avoid - /// unnecessary scalar-to-vector-to-scalar conversions. - DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap; + + /// Maps values from the original loop to their corresponding values in the + /// vectorized loop. A key value can map to either vector values, scalar + /// values or both kinds of values, depending on whether the key was + /// vectorized and scalarized. + ValueMap VectorLoopValueMap; /// Store instructions that should be predicated, as a pair /// <StoreInst, Predicate> - SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores; + SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions; EdgeMaskCache MaskCache; /// Trip count of the original loop. Value *TripCount; /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) Value *VectorTripCount; - /// Map of scalar integer values to the smallest bitwidth they can be legally - /// represented as. The vector equivalents of these values should be truncated - /// to this type. - const MapVector<Instruction *, uint64_t> *MinBWs; - - /// A set of values that should not be widened. This is taken from - /// VecValuesToIgnore in the cost model. 
- SmallPtrSetImpl<const Value *> *ValuesNotWidened; - + /// The legality analysis. LoopVectorizationLegality *Legal; + /// The profitablity analysis. + LoopVectorizationCostModel *Cost; + // Record whether runtime checks are added. bool AddedSafetyChecks; + + // Holds instructions from the original loop whose counterparts in the + // vectorized loop would be trivially dead if generated. For example, + // original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + + // Holds the end values for each induction variable. We save the end values + // so we can later fix-up the external users of the induction variables. + DenseMap<PHINode *, Value *> IVEndValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -689,16 +796,20 @@ public: LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, - unsigned UnrollFactor) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, 1, - UnrollFactor) {} + OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, + LoopVectorizationLegality *LVL, + LoopVectorizationCostModel *CM) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, + UnrollFactor, LVL, CM) {} private: void scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore = false) override; + bool IfPredicateInstr = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps Opcode = + Instruction::BinaryOpsEnd) override; Value *reverseVector(Value *Vec) override; }; @@ -1149,12 +1260,13 @@ public: FK_Enabled = 1, ///< Forcing enabled. }; - LoopVectorizeHints(const Loop *L, bool DisableInterleaving) + LoopVectorizeHints(const Loop *L, bool DisableInterleaving, + OptimizationRemarkEmitter &ORE) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - PotentiallyUnsafe(false), TheLoop(L) { + PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -1176,17 +1288,13 @@ public: bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { if (getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), - vectorizeAnalysisPassName(), *F, - L->getStartLoc(), emitRemark()); + emitRemarkWithHints(); return false; } if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), - vectorizeAnalysisPassName(), *F, - L->getStartLoc(), emitRemark()); + emitRemarkWithHints(); return false; } @@ -1197,11 +1305,12 @@ public: // FIXME: Add interleave.disable metadata. This will allow // vectorize.disable to be used without disabling the pass and errors // to differentiate between disabled vectorization and a width of 1. 
- emitOptimizationRemarkAnalysis( - F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(), - "loop not vectorized: vectorization and interleaving are explicitly " - "disabled, or vectorize width and interleave count are both set to " - "1"); + ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), + "AllDisabled", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: vectorization and interleaving are " + "explicitly disabled, or vectorize width and interleave " + "count are both set to 1"); return false; } @@ -1209,23 +1318,27 @@ public: } /// Dumps all the hint information. - std::string emitRemark() const { - VectorizationReport R; + void emitRemarkWithHints() const { + using namespace ore; if (Force.Value == LoopVectorizeHints::FK_Disabled) - R << "vectorization is explicitly disabled"; + ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "loop not vectorized: vectorization is explicitly disabled"); else { - R << "use -Rpass-analysis=loop-vectorize for more info"; + OptimizationRemarkMissed R(LV_NAME, "MissedDetails", + TheLoop->getStartLoc(), TheLoop->getHeader()); + R << "loop not vectorized"; if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=true"; + R << " (Force=" << NV("Force", true); if (Width.Value != 0) - R << ", Vector Width=" << Width.Value; + R << ", Vector Width=" << NV("VectorWidth", Width.Value); if (Interleave.Value != 0) - R << ", Interleave Count=" << Interleave.Value; + R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); R << ")"; } + ORE.emit(R); } - - return R.str(); } unsigned getWidth() const { return Width.Value; } @@ -1241,7 +1354,7 @@ public: return LV_NAME; if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) return LV_NAME; - return DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint; + return OptimizationRemarkAnalysis::AlwaysPrint; } bool allowReordering() const { @@ -1379,19 +1492,23 @@ private: /// The loop these hints belong to. const Loop *TheLoop; + + /// Interface to emit optimization remarks. 
+ OptimizationRemarkEmitter &ORE; }; -static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, +static void emitAnalysisDiag(const Loop *TheLoop, const LoopVectorizeHints &Hints, + OptimizationRemarkEmitter &ORE, const LoopAccessReport &Message) { const char *Name = Hints.vectorizeAnalysisPassName(); - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); + LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); } static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH) { - emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), - LH.emitRemark()); + const LoopVectorizeHints &LH, + OptimizationRemarkEmitter *ORE) { + LH.emitRemarkWithHints(); if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { if (LH.getWidth() != 1) @@ -1425,12 +1542,12 @@ public: TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, const TargetTransformInfo *TTI, std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, - LoopVectorizationRequirements *R, LoopVectorizeHints *H) - : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), GetLAA(GetLAA), LAI(nullptr), - InterleaveInfo(PSE, L, DT, LI), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), - Hints(H) {} + OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, + LoopVectorizeHints *H) + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), + GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -1490,9 +1607,12 @@ public: /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); - /// Returns true if this instruction will remain scalar after vectorization. + /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); } + /// Returns true if \p I is known to be scalar after vectorization. + bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); } + /// Returns the information that we collected about runtime memory check. const RuntimePointerChecking *getRuntimePointerChecking() const { return LAI->getRuntimePointerChecking(); @@ -1545,6 +1665,17 @@ public: bool isLegalMaskedGather(Type *DataType) { return TTI->isLegalMaskedGather(DataType); } + /// Returns true if the target machine can represent \p V as a masked gather + /// or scatter operation. + bool isLegalGatherOrScatter(Value *V) { + auto *LI = dyn_cast<LoadInst>(V); + auto *SI = dyn_cast<StoreInst>(V); + if (!LI && !SI) + return false; + auto *Ptr = getPointerOperand(V); + auto *Ty = cast<PointerType>(Ptr->getType())->getElementType(); + return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -1553,6 +1684,21 @@ public: unsigned getNumLoads() const { return LAI->getNumLoads(); } unsigned getNumPredStores() const { return NumPredStores; } + /// Returns true if \p I is an instruction that will be scalarized with + /// predication. Such instructions include conditional stores and + /// instructions that may divide by zero. 
+ bool isScalarWithPredication(Instruction *I); + + /// Returns true if \p I is a memory instruction that has a consecutive or + /// consecutive-like pointer operand. Consecutive-like pointers are pointers + /// that are treated like consecutive pointers during vectorization. The + /// pointer operands of interleaved accesses are an example. + bool hasConsecutiveLikePtrOperand(Instruction *I); + + /// Returns true if \p I is a memory instruction that must be scalarized + /// during vectorization. + bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1); + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -1569,9 +1715,24 @@ private: /// transformation. bool canVectorizeWithIfConvert(); - /// Collect the variables that need to stay uniform after vectorization. + /// Collect the instructions that are uniform after vectorization. An + /// instruction is uniform if we represent it with a single scalar value in + /// the vectorized loop corresponding to each vector iteration. Examples of + /// uniform instructions include pointer operands of consecutive or + /// interleaved memory accesses. Note that although uniformity implies an + /// instruction will be scalar, the reverse is not true. In general, a + /// scalarized instruction will be represented by VF scalar values in the + /// vectorized loop, each corresponding to an iteration of the original + /// scalar loop. void collectLoopUniforms(); + /// Collect the instructions that are scalar after vectorization. An + /// instruction is scalar if it is known to be uniform or will be scalarized + /// during vectorization. Non-uniform scalarized instructions will be + /// represented by VF values in the vectorized loop, each corresponding to an + /// iteration of the original scalar loop. + void collectLoopScalars(); + /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -1588,7 +1749,19 @@ private: /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. void emitAnalysis(const LoopAccessReport &Message) const { - emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); + emitAnalysisDiag(TheLoop, *Hints, *ORE, Message); + } + + /// Create an analysis remark that explains why vectorization failed + /// + /// \p RemarkName is the identifier for the remark. If \p I is passed it is + /// an instruction that prevents vectorization. Otherwise the loop is used + /// for the location of the remark. \return the remark object that can be + /// streamed to. + OptimizationRemarkAnalysis + createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { + return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + RemarkName, TheLoop, I); } /// \brief If an access has a symbolic strides, this maps the pointer value to @@ -1613,8 +1786,6 @@ private: PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; - /// Parent function - Function *TheFunction; /// Target Transform Info const TargetTransformInfo *TTI; /// Dominator Tree. @@ -1624,6 +1795,8 @@ private: // And the loop-accesses info corresponding to this loop. This pointer is // null until canVectorizeMemory sets it up. const LoopAccessInfo *LAI; + /// Interface to emit optimization remarks. 
+ OptimizationRemarkEmitter *ORE; /// The interleave access information contains groups of interleaved accesses /// with the same stride and close to each other. @@ -1648,10 +1821,13 @@ private: /// Allowed outside users. This holds the induction and reduction /// vars which can be accessed from outside the loop. SmallPtrSet<Value *, 4> AllowedExit; - /// This set holds the variables which are known to be uniform after - /// vectorization. + + /// Holds the instructions known to be uniform after vectorization. SmallPtrSet<Instruction *, 4> Uniforms; + /// Holds the instructions known to be scalar after vectorization. + SmallPtrSet<Instruction *, 4> Scalars; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; @@ -1679,10 +1855,11 @@ public: LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, - AssumptionCache *AC, const Function *F, + AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints) : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), TheFunction(F), Hints(Hints) {} + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1707,13 +1884,6 @@ public: unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost); - /// \return The most profitable unroll factor. - /// This method finds the best unroll-factor based on register pressure and - /// other parameters. VF and LoopCost are the selected vectorization factor - /// and the cost of the selected VF. - unsigned computeInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); - /// \brief A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -1732,6 +1902,29 @@ public: /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); + /// \returns The smallest bitwidth each instruction can be represented with. + /// The vector equivalents of these instructions should be truncated to this + /// type. + const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { + return MinBWs; + } + + /// \returns True if it is more profitable to scalarize instruction \p I for + /// vectorization factor \p VF. + bool isProfitableToScalarize(Instruction *I, unsigned VF) const { + auto Scalars = InstsToScalarize.find(VF); + assert(Scalars != InstsToScalarize.end() && + "VF not yet analyzed for scalarization profitability"); + return Scalars->second.count(I); + } + + /// \returns True if instruction \p I can be truncated to a smaller bitwidth + /// for vectorization factor \p VF. + bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const { + return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) && + !Legal->isScalarAfterVectorization(I); + } + private: /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1760,20 +1953,44 @@ private: /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); - /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. These are handled as LoopAccessReport rather than - /// VectorizationReport because the << operator of VectorizationReport returns - /// LoopAccessReport. 
- void emitAnalysis(const LoopAccessReport &Message) const { - emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); + /// Create an analysis remark that explains why vectorization failed + /// + /// \p RemarkName is the identifier for the remark. \return the remark object + /// that can be streamed to. + OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { + return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + RemarkName, TheLoop); } -public: /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. MapVector<Instruction *, uint64_t> MinBWs; + /// A type representing the costs for instructions if they were to be + /// scalarized rather than vectorized. The entries are Instruction-Cost + /// pairs. + typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; + + /// A map holding scalar costs for different vectorization factors. The + /// presence of a cost for an instruction in the mapping indicates that the + /// instruction will be scalarized when vectorizing with the associated + /// vectorization factor. The entries are VF-ScalarCostTy pairs. + DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + + /// Returns the expected difference in cost from scalarizing the expression + /// feeding a predicated instruction \p PredInst. The instructions to + /// scalarize and their scalar costs are collected in \p ScalarCosts. A + /// non-negative return value implies the expression will be scalarized. + /// Currently, only single-use chains are considered for scalarization. + int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, + unsigned VF); + + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + +public: /// The loop that we evaluate. Loop *TheLoop; /// Predicated scalar evolution analysis. @@ -1790,6 +2007,9 @@ public: DemandedBits *DB; /// Assumption cache. AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; + const Function *TheFunction; /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; @@ -1813,8 +2033,8 @@ public: /// followed by a non-expert user. class LoopVectorizationRequirements { public: - LoopVectorizationRequirements() - : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) + : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {} void addUnsafeAlgebraInst(Instruction *I) { // First unsafe algebra instruction. 
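For illustration, a minimal sketch of the remark style this patch migrates to: diagnostics are built as OptimizationRemarkAnalysis objects, streamed with ore::NV named values, and handed to an OptimizationRemarkEmitter instead of the removed emitOptimizationRemark* free functions. The function name, pass name, remark identifier, and message below are placeholders, not part of the patch, and the snippet assumes the headers already included by LoopVectorize.cpp.

// Illustrative sketch only; names other than the LLVM API calls are invented.
static void reportMissedVectorization(Loop *L, OptimizationRemarkEmitter &ORE) {
  // Build the remark, attach named values, and emit it through ORE.
  ORE.emit(OptimizationRemarkAnalysis("loop-vectorize", "ExampleRemark",
                                      L->getStartLoc(), L->getHeader())
           << "loop not vectorized: example diagnostic, width="
           << ore::NV("VectorWidth", 4u));
}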
@@ -1825,13 +2045,15 @@ public: void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { - const char *Name = Hints.vectorizeAnalysisPassName(); + const char *PassName = Hints.vectorizeAnalysisPassName(); bool Failed = false; if (UnsafeAlgebraInst && !Hints.allowReordering()) { - emitOptimizationRemarkAnalysisFPCommute( - F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), - VectorizationReport() << "cannot prove it is safe to reorder " - "floating-point operations"); + ORE.emit( + OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps", + UnsafeAlgebraInst->getDebugLoc(), + UnsafeAlgebraInst->getParent()) + << "loop not vectorized: cannot prove it is safe to reorder " + "floating-point operations"); Failed = true; } @@ -1842,10 +2064,11 @@ public: NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; if ((ThresholdReached && !Hints.allowReordering()) || PragmaThresholdReached) { - emitOptimizationRemarkAnalysisAliasing( - F->getContext(), Name, *F, L->getStartLoc(), - VectorizationReport() - << "cannot prove it is safe to reorder memory operations"); + ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", + L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"); DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Failed = true; } @@ -1856,6 +2079,9 @@ public: private: unsigned NumRuntimePointerChecks; Instruction *UnsafeAlgebraInst; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; }; static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { @@ -1897,12 +2123,13 @@ struct LoopVectorize : public FunctionPass { auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); + auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA); + GetLAA, *ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -1917,6 +2144,7 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<DemandedBitsWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<BasicAAWrapperPass>(); @@ -1949,7 +2177,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { } void InnerLoopVectorizer::createVectorIntInductionPHI( - const InductionDescriptor &II, VectorParts &Entry, IntegerType *TruncType) { + const InductionDescriptor &II, Instruction *EntryVal) { Value *Start = II.getStartValue(); ConstantInt *Step = II.getConstIntStepValue(); assert(Step && "Can not widen an IV with a non-constant step"); @@ -1957,7 +2185,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( // Construct the initial value of the vector IV in the vector loop preheader auto CurrIP = Builder.saveIP(); Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - if (TruncType) { + if (isa<TruncInst>(EntryVal)) { + auto *TruncType = cast<IntegerType>(EntryVal->getType()); Step = 
ConstantInt::getSigned(TruncType, Step->getSExtValue()); Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); } @@ -1972,18 +2201,45 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*LoopVectorBody->getFirstInsertionPt()); - Value *LastInduction = VecInd; + Instruction *LastInduction = VecInd; + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = LastInduction; - LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add"); + LastInduction = cast<Instruction>( + Builder.CreateAdd(LastInduction, SplatVF, "step.add")); } + VectorLoopValueMap.initVector(EntryVal, Entry); + if (isa<TruncInst>(EntryVal)) + addMetadata(Entry, EntryVal); + + // Move the last step to the end of the latch block. This ensures consistent + // placement of all induction updates. + auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); + auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); + auto *ICmp = cast<Instruction>(Br->getCondition()); + LastInduction->moveBefore(ICmp); + LastInduction->setName("vec.ind.next"); VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); - VecInd->addIncoming(LastInduction, LoopVectorBody); + VecInd->addIncoming(LastInduction, LoopVectorLatch); } -void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, - TruncInst *Trunc) { +bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { + return Legal->isScalarAfterVectorization(I) || + Cost->isProfitableToScalarize(I, VF); +} + +bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { + if (shouldScalarizeInstruction(IV)) + return true; + auto isScalarInst = [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); + }; + return any_of(IV->users(), isScalarInst); +} + +void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { auto II = Legal->getInductionVars()->find(IV); assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); @@ -1991,12 +2247,25 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); - // If a truncate instruction was provided, get the smaller type. - auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr; + // The scalar value to broadcast. This will be derived from the canonical + // induction variable. + Value *ScalarIV = nullptr; // The step of the induction. Value *Step = nullptr; + // The value from the original loop to which we are mapping the new induction + // variable. + Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; + + // True if we have vectorized the induction variable. + auto VectorizedIV = false; + + // Determine if we want a scalar version of the induction variable. This is + // true if the induction variable itself is not widened, or if it has at + // least one user in the loop that is not widened. + auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); + // If the induction variable has a constant integer step value, go ahead and // get it now. if (ID.getConstIntStepValue()) @@ -2006,40 +2275,50 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, // create the phi node, we will splat the scalar induction variable in each // loop iteration. 
if (VF > 1 && IV->getType() == Induction->getType() && Step && - !ValuesNotWidened->count(IV)) - return createVectorIntInductionPHI(ID, Entry, TruncType); - - // The scalar value to broadcast. This will be derived from the canonical - // induction variable. - Value *ScalarIV = nullptr; - - // Define the scalar induction variable and step values. If we were given a - // truncation type, truncate the canonical induction variable and constant - // step. Otherwise, derive these values from the induction descriptor. - if (TruncType) { - assert(Step && "Truncation requires constant integer step"); - auto StepInt = cast<ConstantInt>(Step)->getSExtValue(); - ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType); - Step = ConstantInt::getSigned(TruncType, StepInt); - } else { - ScalarIV = Induction; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - if (IV != OldInduction) { - ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); - ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); - ScalarIV->setName("offset.idx"); - } - if (!Step) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), - &*Builder.GetInsertPoint()); + !shouldScalarizeInstruction(EntryVal)) { + createVectorIntInductionPHI(ID, EntryVal); + VectorizedIV = true; + } + + // If we haven't yet vectorized the induction variable, or if we will create + // a scalar one, we need to define the scalar induction variable and step + // values. If we were given a truncation type, truncate the canonical + // induction variable and constant step. Otherwise, derive these values from + // the induction descriptor. + if (!VectorizedIV || NeedsScalarIV) { + if (Trunc) { + auto *TruncType = cast<IntegerType>(Trunc->getType()); + assert(Step && "Truncation requires constant integer step"); + auto StepInt = cast<ConstantInt>(Step)->getSExtValue(); + ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType); + Step = ConstantInt::getSigned(TruncType, StepInt); + } else { + ScalarIV = Induction; + auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + if (IV != OldInduction) { + ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()); + ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL); + ScalarIV->setName("offset.idx"); + } + if (!Step) { + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); + Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), + &*Builder.GetInsertPoint()); + } } } - // Splat the scalar induction variable, and build the necessary step vectors. - Value *Broadcasted = getBroadcastInstrs(ScalarIV); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + // If we haven't yet vectorized the induction variable, splat the scalar + // induction variable, and build the necessary step vectors. + if (!VectorizedIV) { + Value *Broadcasted = getBroadcastInstrs(ScalarIV); + VectorParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + VectorLoopValueMap.initVector(EntryVal, Entry); + if (Trunc) + addMetadata(Entry, Trunc); + } // If an induction variable is only used for counting loop iterations or // calculating addresses, it doesn't need to be widened. 
Create scalar steps @@ -2047,38 +2326,64 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry, // addition of the scalar steps will not increase the number of instructions // in the loop in the common case prior to InstCombine. We will be trading // one vector extract for each scalar step. - if (VF > 1 && ValuesNotWidened->count(IV)) { - auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV; + if (NeedsScalarIV) buildScalarSteps(ScalarIV, Step, EntryVal); - } } -Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, - Value *Step) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps BinOp) { + // Create and check the types. assert(Val->getType()->isVectorTy() && "Must be a vector"); - assert(Val->getType()->getScalarType()->isIntegerTy() && - "Elem must be an integer"); - assert(Step->getType() == Val->getType()->getScalarType() && - "Step has wrong type"); - // Create the types. - Type *ITy = Val->getType()->getScalarType(); - VectorType *Ty = cast<VectorType>(Val->getType()); - int VLen = Ty->getNumElements(); + int VLen = Val->getType()->getVectorNumElements(); + + Type *STy = Val->getType()->getScalarType(); + assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && + "Induction Step must be an integer or FP"); + assert(Step->getType() == STy && "Step has wrong type"); + SmallVector<Constant *, 8> Indices; + if (STy->isIntegerTy()) { + // Create a vector of consecutive numbers from zero to VF. + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(STy, StartIdx + i)); + + // Add the consecutive indices to the vector value. + Constant *Cv = ConstantVector::get(Indices); + assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); + } + + // Floating point induction. + assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && + "Binary Opcode should be specified for FP induction"); // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); + Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); - assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); + Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw flags, - // which can be found from the original scalar operations. - Step = Builder.CreateMul(Cv, Step); - return Builder.CreateAdd(Val, Step, "induction"); + + // Floating point operations had to be 'fast' to enable the induction. 
+ FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + + Value *MulOp = Builder.CreateFMul(Cv, Step); + if (isa<Instruction>(MulOp)) + // Have to check, MulOp may be a constant + cast<Instruction>(MulOp)->setFastMathFlags(Flags); + + Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); + if (isa<Instruction>(BOp)) + cast<Instruction>(BOp)->setFastMathFlags(Flags); + return BOp; } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, @@ -2092,98 +2397,34 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() && "Val and Step should have the same integer type"); - // Compute the scalar steps and save the results in ScalarIVMap. - for (unsigned Part = 0; Part < UF; ++Part) - for (unsigned I = 0; I < VF; ++I) { - auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I); + // Determine the number of scalars we need to generate for each unroll + // iteration. If EntryVal is uniform, we only need to generate the first + // lane. Otherwise, we generate all VF values. + unsigned Lanes = + Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF; + + // Compute the scalar steps and save the results in VectorLoopValueMap. + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane); auto *Mul = Builder.CreateMul(StartIdx, Step); auto *Add = Builder.CreateAdd(ScalarIV, Mul); - ScalarIVMap[EntryVal].push_back(Add); + Entry[Part][Lane] = Add; } + } + VectorLoopValueMap.initScalar(EntryVal, Entry); } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); - auto *SE = PSE.getSE(); - // Make sure that the pointer does not point to structs. - if (Ptr->getType()->getPointerElementType()->isAggregateType()) - return 0; - - // If this value is a pointer induction variable, we know it is consecutive. - PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); - if (Phi && Inductions.count(Phi)) { - InductionDescriptor II = Inductions[Phi]; - return II.getConsecutiveDirection(); - } - - GetElementPtrInst *Gep = getGEPInstruction(Ptr); - if (!Gep) - return 0; - - unsigned NumOperands = Gep->getNumOperands(); - Value *GpPtr = Gep->getPointerOperand(); - // If this GEP value is a consecutive pointer induction variable and all of - // the indices are constant, then we know it is consecutive. - Phi = dyn_cast<PHINode>(GpPtr); - if (Phi && Inductions.count(Phi)) { - - // Make sure that the pointer does not point to structs. - PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); - if (GepPtrType->getElementType()->isAggregateType()) - return 0; - - // Make sure that all of the index operands are loop invariant. - for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) - return 0; - - InductionDescriptor II = Inductions[Phi]; - return II.getConsecutiveDirection(); - } - - unsigned InductionOperand = getGEPInductionOperand(Gep); - - // Check that all of the gep indices are uniform except for our induction - // operand. - for (unsigned i = 0; i != NumOperands; ++i) - if (i != InductionOperand && - !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) - return 0; - - // We can emit wide load/stores only if the last non-zero index is the - // induction variable. 
- const SCEV *Last = nullptr; - if (!getSymbolicStrides() || !getSymbolicStrides()->count(Gep)) - Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); - else { - // Because of the multiplication by a stride we can have a s/zext cast. - // We are going to replace this stride by 1 so the cast is safe to ignore. - // - // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - // %0 = trunc i64 %indvars.iv to i32 - // %mul = mul i32 %0, %Stride1 - // %idxprom = zext i32 %mul to i64 << Safe cast. - // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom - // - Last = replaceSymbolicStrideSCEV(PSE, *getSymbolicStrides(), - Gep->getOperand(InductionOperand), Gep); - if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) - Last = - (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) - ? C->getOperand() - : Last; - } - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { - const SCEV *Step = AR->getStepRecurrence(*SE); - // The memory is consecutive because the last index is consecutive - // and all other indices are loop invariant. - if (Step->isOne()) - return 1; - if (Step->isAllOnesValue()) - return -1; - } + const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() : + ValueToValueMap(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + if (Stride == 1 || Stride == -1) + return Stride; return 0; } @@ -2191,23 +2432,112 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -InnerLoopVectorizer::VectorParts & +const InnerLoopVectorizer::VectorParts & InnerLoopVectorizer::getVectorValue(Value *V) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); + assert(!V->getType()->isVoidTy() && "Type does not produce a value"); // If we have a stride that is replaced by one, do it here. if (Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); // If we have this scalar in the map, return it. - if (WidenMap.has(V)) - return WidenMap.get(V); + if (VectorLoopValueMap.hasVector(V)) + return VectorLoopValueMap.VectorMapStorage[V]; + + // If the value has not been vectorized, check if it has been scalarized + // instead. If it has been scalarized, and we actually need the value in + // vector form, we will construct the vector values on demand. + if (VectorLoopValueMap.hasScalar(V)) { + + // Initialize a new vector map entry. + VectorParts Entry(UF); + + // If we've scalarized a value, that value should be an instruction. + auto *I = cast<Instruction>(V); + + // If we aren't vectorizing, we can just copy the scalar map values over to + // the vector map. + if (VF == 1) { + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part] = getScalarValue(V, Part, 0); + return VectorLoopValueMap.initVector(V, Entry); + } + + // Get the last scalar instruction we generated for V. If the value is + // known to be uniform after vectorization, this corresponds to lane zero + // of the last unroll iteration. Otherwise, the last instruction is the one + // we created for the last vector lane of the last unroll iteration. + unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1; + auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane)); + + // Set the insert point after the last scalarized instruction. This ensures + // the insertelement sequence will directly follow the scalar definitions. 
+ auto OldIP = Builder.saveIP(); + auto NewIP = std::next(BasicBlock::iterator(LastInst)); + Builder.SetInsertPoint(&*NewIP); + + // However, if we are vectorizing, we need to construct the vector values. + // If the value is known to be uniform after vectorization, we can just + // broadcast the scalar value corresponding to lane zero for each unroll + // iteration. Otherwise, we construct the vector values using insertelement + // instructions. Since the resulting vectors are stored in + // VectorLoopValueMap, we will only generate the insertelements once. + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VectorValue = nullptr; + if (Legal->isUniformAfterVectorization(I)) { + VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); + } else { + VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + for (unsigned Lane = 0; Lane < VF; ++Lane) + VectorValue = Builder.CreateInsertElement( + VectorValue, getScalarValue(V, Part, Lane), + Builder.getInt32(Lane)); + } + Entry[Part] = VectorValue; + } + Builder.restoreIP(OldIP); + return VectorLoopValueMap.initVector(V, Entry); + } // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - return WidenMap.splat(V, B); + return VectorLoopValueMap.initVector(V, VectorParts(UF, B)); +} + +Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, + unsigned Lane) { + + // If the value is not an instruction contained in the loop, it should + // already be scalar. + if (OrigLoop->isLoopInvariant(V)) + return V; + + assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V)) + : true && "Uniform values only have lane zero"); + + // If the value from the original loop has not been vectorized, it is + // represented by UF x VF scalar values in the new loop. Return the requested + // scalar value. + if (VectorLoopValueMap.hasScalar(V)) + return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane]; + + // If the value has not been scalarized, get its entry in VectorLoopValueMap + // for the given unroll part. If this entry is not a vector type (i.e., the + // vectorization factor is one), there is no need to generate an + // extractelement instruction. + auto *U = getVectorValue(V)[Part]; + if (!U->getType()->isVectorTy()) { + assert(VF == 1 && "Value not scalarized has non-vector type"); + return U; + } + + // Otherwise, the value from the original loop has been vectorized and is + // represented by UF vector values. Extract and return the requested scalar + // value from the appropriate vector lane. + return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2355,7 +2685,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); - Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + Value *Ptr = getPointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); @@ -2365,15 +2695,20 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // Prepare for the new pointers. 
setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrParts = getVectorValue(Ptr); SmallVector<Value *, 2> NewPtrs; unsigned Index = Group->getIndex(Instr); + + // If the group is reverse, adjust the index to refer to the last vector lane + // instead of the first. We adjust the index from the first vector lane, + // rather than directly getting the pointer for lane VF - 1, because the + // pointer operand of the interleaved access is supposed to be uniform. For + // uniform instructions, we're only required to generate a value for the + // first vector lane in each unroll iteration. + if (Group->isReverse()) + Index += (VF - 1) * Group->getFactor(); + for (unsigned Part = 0; Part < UF; Part++) { - // Extract the pointer for current instruction from the pointer vector. A - // reverse access uses the pointer in the last lane. - Value *NewPtr = Builder.CreateExtractElement( - PtrParts[Part], - Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + Value *NewPtr = getScalarValue(Ptr, Part, 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2397,20 +2732,30 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // Vectorize the interleaved load group. if (LI) { + + // For each unroll part, create a wide load for the group. + SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - Instruction *NewLoadInstr = Builder.CreateAlignedLoad( + auto *NewLoad = Builder.CreateAlignedLoad( NewPtrs[Part], Group->getAlignment(), "wide.vec"); + addMetadata(NewLoad, Instr); + NewLoads.push_back(NewLoad); + } - for (unsigned i = 0; i < InterleaveFactor; i++) { - Instruction *Member = Group->getMember(i); + // For each member in the group, shuffle out the appropriate data from the + // wide loads. + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); - // Skip the gaps in the group. - if (!Member) - continue; + // Skip the gaps in the group. + if (!Member) + continue; - Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF); + VectorParts Entry(UF); + Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF); + for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( - NewLoadInstr, UndefVec, StrideMask, "strided.vec"); + NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { @@ -2418,12 +2763,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); } - VectorParts &Entry = WidenMap.get(Member); Entry[Part] = Group->isReverse() ? reverseVector(StridedVec) : StridedVec; } - - addMetadata(NewLoadInstr, Instr); + VectorLoopValueMap.initVector(Member, Entry); } return; } @@ -2479,7 +2822,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + Value *Ptr = getPointerOperand(Instr); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. 
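Two notes on the hunk above, for illustration. First, the reversed-group adjustment Index += (VF - 1) * Group->getFactor() can be checked by hand: with VF = 4 and an interleave factor of 2, a member at index 1 becomes 1 + 3 * 2 = 7, i.e. the member's element in the last vector lane, which is what a reversed access needs. Second, the memory paths now fetch pointers through a getPointerOperand helper rather than the earlier LI/SI ternary; its definition is outside this hunk, so the sketch below (with a hypothetical name to avoid clashing with the real helper) only shows the behavior it presumably has.

// Assumed behavior of the getPointerOperand helper used above (sketch).
static Value *getPointerOperandSketch(Value *I) {
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();   // pointer a load reads from
  if (auto *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();   // pointer a store writes to
  return nullptr;                     // not a memory access
}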
@@ -2487,93 +2830,57 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (!Alignment) Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy); - uint64_t VectorElementSize = DL.getTypeStoreSize(DataTy) / VF; - if (SI && Legal->blockNeedsPredication(SI->getParent()) && - !Legal->isMaskRequired(SI)) - return scalarizeInstruction(Instr, true); + // Scalarize the memory instruction if necessary. + if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) + return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); - if (ScalarAllocatedSize != VectorElementSize) - return scalarizeInstruction(Instr); - - // If the pointer is loop invariant scalarize the load. - if (LI && Legal->isUniform(Ptr)) - return scalarizeInstruction(Instr); - - // If the pointer is non-consecutive and gather/scatter is not supported - // scalarize the instruction. + // Determine if the pointer operand of the access is either consecutive or + // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - bool CreateGatherScatter = - !ConsecutiveStride && ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) || - (SI && Legal->isLegalMaskedScatter(ScalarDataTy))); - if (!ConsecutiveStride && !CreateGatherScatter) - return scalarizeInstruction(Instr); + // Determine if either a gather or scatter operation is legal. + bool CreateGatherScatter = + !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr); - Constant *Zero = Builder.getInt32(0); - VectorParts &Entry = WidenMap.get(Instr); VectorParts VectorGep; // Handle consecutive loads/stores. GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (ConsecutiveStride) { - if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { - setDebugLocFromInst(Builder, Gep); - Value *PtrOperand = Gep->getPointerOperand(); - Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; - FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(0, FirstBasePtr); - Gep2->setName("gep.indvar.base"); - Ptr = Builder.Insert(Gep2); - } else if (Gep) { - setDebugLocFromInst(Builder, Gep); - assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), - OrigLoop) && - "Base ptr must be invariant"); - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; + if (Gep) { unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - // Create the new GEP with the new induction variable. +#ifndef NDEBUG + // The original GEP that identified as a consecutive memory access + // should have only one loop-variant operand. + unsigned NumOfLoopVariantOps = 0; + for (unsigned i = 0; i < NumOperands; ++i) + if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), + OrigLoop)) + NumOfLoopVariantOps++; + assert(NumOfLoopVariantOps == 1 && + "Consecutive GEP should have only one loop-variant operand"); +#endif GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - - for (unsigned i = 0; i < NumOperands; ++i) { - Value *GepOperand = Gep->getOperand(i); - Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); - - // Update last index or loop invariant instruction anchored in loop. 
- if (i == InductionOperand || - (GepOperandInst && OrigLoop->contains(GepOperandInst))) { - assert((i == InductionOperand || - PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), - OrigLoop)) && - "Must be last index or loop invariant"); - - VectorParts &GEPParts = getVectorValue(GepOperand); - - // If GepOperand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction. - Value *Index = ScalarIVMap.count(GepOperand) - ? ScalarIVMap[GepOperand][0] - : Builder.CreateExtractElement(GEPParts[0], Zero); - - Gep2->setOperand(i, Index); - Gep2->setName("gep.indvar.idx"); - } - } + Gep2->setName("gep.indvar"); + + // A new GEP is created for a 0-lane value of the first unroll iteration. + // The GEPs for the rest of the unroll iterations are computed below as an + // offset from this GEP. + for (unsigned i = 0; i < NumOperands; ++i) + // We can apply getScalarValue() for all GEP indices. It returns an + // original value for loop-invariant operand and 0-lane for consecutive + // operand. + Gep2->setOperand(i, getScalarValue(Gep->getOperand(i), + 0, /* First unroll iteration */ + 0 /* 0-lane of the vector */ )); + setDebugLocFromInst(Builder, Gep); Ptr = Builder.Insert(Gep2); + } else { // No GEP - // Use the induction element ptr. - assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + Ptr = getScalarValue(Ptr, 0, 0); } } else { // At this point we should vector version of GEP for Gather or Scatter @@ -2660,6 +2967,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle loads. assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewLI; if (CreateGatherScatter) { @@ -2692,70 +3000,45 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } addMetadata(NewLI, LI); } + VectorLoopValueMap.initVector(Instr, Entry); } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore) { + bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + DEBUG(dbgs() << "LV: Scalarizing" + << (IfPredicateInstr ? " and predicating:" : ":") << *Instr + << '\n'); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. - auto *SrcInst = dyn_cast<Instruction>(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block, - // then it should already be vectorized. - if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. 
- VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = - IsVoidRetTy ? nullptr - : UndefValue::get(VectorType::get(Instr->getType(), VF)); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // Initialize a new scalar map entry. + ScalarParts Entry(UF); VectorParts Cond; - if (IfPredicateStore) { - assert(Instr->getParent()->getSinglePredecessor() && - "Only support single predecessor blocks"); - Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), - Instr->getParent()); - } + if (IfPredicateInstr) + Cond = createBlockInMask(Instr->getParent()); + + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF; // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); // For each scalar that we create: - for (unsigned Width = 0; Width < VF; ++Width) { + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { // Start if-block. Value *Cmp = nullptr; - if (IfPredicateStore) { - Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); + if (IfPredicateInstr) { + Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); } @@ -2763,18 +3046,11 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - // If the operand is an induction variable, and there's a scalarized - // version of it available, use it. Otherwise, we will need to create - // an extractelement instruction if vectorizing. - auto *NewOp = Params[op][Part]; - auto *ScalarOp = Instr->getOperand(op); - if (ScalarIVMap.count(ScalarOp)) - NewOp = ScalarIVMap[ScalarOp][VF * Part + Width]; - else if (NewOp->getType()->isVectorTy()) - NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width)); + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -2782,22 +3058,20 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to the scalar map entry. + Entry[Part][Lane] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, - Builder.getInt32(Width)); // End if-block. 
- if (IfPredicateStore) - PredicatedStores.push_back( - std::make_pair(cast<StoreInst>(Cloned), Cmp)); + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } + VectorLoopValueMap.initScalar(Instr, Entry); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -2811,10 +3085,12 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, Latch = Header; IRBuilder<> Builder(&*Header->getFirstInsertionPt()); - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); + setDebugLocFromInst(Builder, OldInst); auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); Builder.SetInsertPoint(Latch->getTerminator()); + setDebugLocFromInst(Builder, OldInst); // Create i+1 and fill the PHINode. Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); @@ -3146,14 +3422,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); - Value *EndValue; + Value *&EndValue = IVEndValues[OrigPhi]; if (OrigPhi == OldInduction) { // We know what the end value is. EndValue = CountRoundDown; } else { IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); - Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, - II.getStep()->getType(), "cast.crd"); + Type *StepType = II.getStep()->getType(); + Instruction::CastOps CastOp = + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); EndValue = II.transform(B, CRD, PSE.getSE(), DL); EndValue->setName("ind.end"); @@ -3163,9 +3441,6 @@ void InnerLoopVectorizer::createEmptyLoop() { // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, MiddleBlock); - // Fix up external users of the induction variable. - fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock); - // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); @@ -3201,7 +3476,7 @@ void InnerLoopVectorizer::createEmptyLoop() { if (MDNode *LID = OrigLoop->getLoopID()) Lp->setLoopID(LID); - LoopVectorizeHints Hints(Lp, true); + LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); } @@ -3324,8 +3599,9 @@ static Value *addFastMathFlag(Value *V) { return V; } -/// Estimate the overhead of scalarizing a value. Insert and Extract are set if -/// the result needs to be inserted and/or extracted from vectors. +/// \brief Estimate the overhead of scalarizing a value based on its type. +/// Insert and Extract are set if the result needs to be inserted and/or +/// extracted from vectors. 
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, const TargetTransformInfo &TTI) { if (Ty->isVoidTy()) @@ -3335,15 +3611,46 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, unsigned Cost = 0; for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) { - if (Insert) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); if (Extract) Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I); + if (Insert) + Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I); } return Cost; } +/// \brief Estimate the overhead of scalarizing an Instruction based on the +/// types of its operands and return value. +static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys, + Type *RetTy, + const TargetTransformInfo &TTI) { + unsigned ScalarizationCost = + getScalarizationOverhead(RetTy, true, false, TTI); + + for (Type *Ty : OpTys) + ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); + + return ScalarizationCost; +} + +/// \brief Estimate the overhead of scalarizing an instruction. This is a +/// convenience wrapper for the type-based getScalarizationOverhead API. +static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, + const TargetTransformInfo &TTI) { + if (VF == 1) + return 0; + + Type *RetTy = ToVectorTy(I->getType(), VF); + + SmallVector<Type *, 4> OpTys; + unsigned OperandsNum = I->getNumOperands(); + for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd) + OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF)); + + return getScalarizationOverhead(OpTys, RetTy, TTI); +} + // Estimate cost of a call instruction CI if it were vectorized with factor VF. // Return the cost of the instruction, including scalarization overhead if it's // needed. The flag NeedToScalarize shows if the call needs to be scalarized - @@ -3374,10 +3681,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = - getScalarizationOverhead(RetTy, true, false, TTI); - for (Type *Ty : Tys) - ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3434,8 +3738,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // later and will remove any ext/trunc pairs. // SmallPtrSet<Value *, 4> Erased; - for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + for (const auto &KV : Cost->getMinimalBitwidths()) { + // If the value wasn't vectorized, we must maintain the original scalar + // type. The absence of the value from VectorLoopValueMap indicates that it + // wasn't vectorized. + if (!VectorLoopValueMap.hasVector(KV.first)) + continue; + VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); for (Value *&I : Parts) { if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; @@ -3526,8 +3835,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { } // We'll have created a bunch of ZExts that are now parentless. Clean up. - for (const auto &KV : *MinBWs) { - VectorParts &Parts = WidenMap.get(KV.first); + for (const auto &KV : Cost->getMinimalBitwidths()) { + // If the value wasn't vectorized, we must maintain the original scalar + // type. The absence of the value from VectorLoopValueMap indicates that it + // wasn't vectorized. 
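
The getScalarizationOverhead helpers added above charge one insertelement per produced lane and one extractelement per consumed operand lane, and nothing at VF = 1. A rough standalone model of that accounting, for illustration only; the flat UnitCost stands in for whatever TTI.getVectorInstrCost would report per element, and the sketch names are hypothetical:

#include <cstdio>

// One insert per produced lane, one extract per consumed operand lane.
unsigned typeOverheadSketch(unsigned VF, bool Insert, bool Extract,
                            unsigned UnitCost = 1) {
  unsigned Cost = 0;
  for (unsigned I = 0; I < VF; ++I) {
    if (Extract)
      Cost += UnitCost; // extractelement of lane I
    if (Insert)
      Cost += UnitCost; // insertelement of lane I
  }
  return Cost;
}

unsigned instrOverheadSketch(unsigned NumOperands, unsigned VF) {
  if (VF == 1)
    return 0; // scalar code needs no packing or unpacking
  unsigned Cost = typeOverheadSketch(VF, /*Insert=*/true, /*Extract=*/false);
  for (unsigned Op = 0; Op < NumOperands; ++Op)
    Cost += typeOverheadSketch(VF, /*Insert=*/false, /*Extract=*/true);
  return Cost;
}

int main() {
  // A two-operand instruction at VF = 4: 4 inserts + 2 * 4 extracts = 12.
  std::printf("%u\n", instrOverheadSketch(2, 4));
}
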
+ if (!VectorLoopValueMap.hasVector(KV.first)) + continue; + VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); for (Value *&I : Parts) { ZExtInst *Inst = dyn_cast<ZExtInst>(I); if (Inst && Inst->use_empty()) { @@ -3558,6 +3872,11 @@ void InnerLoopVectorizer::vectorizeLoop() { // are vectorized, so we can use them to construct the PHI. PhiVector PHIsToFix; + // Collect instructions from the original loop that will become trivially + // dead in the vectorized loop. We don't need to vectorize these + // instructions. + collectTriviallyDeadInstructions(); + // Scan the loop in a topological order to ensure that defs are vectorized // before users. LoopBlocksDFS DFS(OrigLoop); @@ -3605,7 +3924,7 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - VectorParts &VectorExit = getVectorValue(LoopExitInst); + const VectorParts &VectorExit = getVectorValue(LoopExitInst); Type *VecTy = VectorExit[0]->getType(); // Find the reduction identity variable. Zero for addition, or, xor, @@ -3644,10 +3963,10 @@ void InnerLoopVectorizer::vectorizeLoop() { // Reductions do not have to start at zero. They can start with // any loop invariant values. - VectorParts &VecRdxPhi = WidenMap.get(Phi); + const VectorParts &VecRdxPhi = getVectorValue(Phi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - VectorParts &Val = getVectorValue(LoopVal); + const VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { // Make sure to add the reduction stat value only to the // first unroll part. @@ -3664,7 +3983,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts = getVectorValue(LoopExitInst); + VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); // If the vector reduction can be performed in a smaller type, we truncate @@ -3792,22 +4111,25 @@ void InnerLoopVectorizer::vectorizeLoop() { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } // end of for each Phi in PHIsToFix. - fixLCSSAPHIs(); - - // Make sure DomTree is updated. + // Update the dominator tree. + // + // FIXME: After creating the structure of the new loop, the dominator tree is + // no longer up-to-date, and it remains that way until we update it + // here. An out-of-date dominator tree is problematic for SCEV, + // because SCEVExpander uses it to guide code generation. The + // vectorizer use SCEVExpanders in several places. Instead, we should + // keep the dominator tree up-to-date as we go. updateAnalysis(); - // Predicate any stores. - for (auto KV : PredicatedStores) { - BasicBlock::iterator I(KV.first); - auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); - auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, - /*BranchWeights=*/nullptr, DT, LI); - I->moveBefore(T); - I->getParent()->setName("pred.store.if"); - BB->setName("pred.store.continue"); - } - DEBUG(DT->verifyDomTree()); + // Fix-up external users of the induction variables. + for (auto &Entry : *Legal->getInductionVars()) + fixupIVUsers(Entry.first, Entry.second, + getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), + IVEndValues[Entry.first], LoopMiddleBlock); + + fixLCSSAPHIs(); + predicateInstructions(); + // Remove redundant induction instructions. 
cse(LoopVectorBody); } @@ -3882,7 +4204,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We constructed a temporary phi node in the first phase of vectorization. // This phi node will eventually be deleted. - auto &PhiParts = getVectorValue(Phi); + VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi); Builder.SetInsertPoint(cast<Instruction>(PhiParts[0])); // Create a phi node for the new recurrence. The current value will either be @@ -3974,10 +4296,217 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { } } +void InnerLoopVectorizer::collectTriviallyDeadInstructions() { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + + // We create new control-flow for the vectorized loop, so the original + // condition will be dead after vectorization if it's only used by the + // branch. + auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); + if (Cmp && Cmp->hasOneUse()) + DeadInstructions.insert(Cmp); + + // We create new "steps" for induction variable updates to which the original + // induction variables map. An original update instruction will be dead if + // all its users except the induction variable are dead. + for (auto &Induction : *Legal->getInductionVars()) { + PHINode *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + if (all_of(IndUpdate->users(), [&](User *U) -> bool { + return U == Ind || DeadInstructions.count(cast<Instruction>(U)); + })) + DeadInstructions.insert(IndUpdate); + } +} + +void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { + + // The basic block and loop containing the predicated instruction. + auto *PredBB = PredInst->getParent(); + auto *VectorLoop = LI->getLoopFor(PredBB); + + // Initialize a worklist with the operands of the predicated instruction. + SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); + + // Holds instructions that we need to analyze again. An instruction may be + // reanalyzed if we don't yet know if we can sink it or not. + SmallVector<Instruction *, 8> InstsToReanalyze; + + // Returns true if a given use occurs in the predicated block. Phi nodes use + // their operands in their corresponding predecessor blocks. + auto isBlockOfUsePredicated = [&](Use &U) -> bool { + auto *I = cast<Instruction>(U.getUser()); + BasicBlock *BB = I->getParent(); + if (auto *Phi = dyn_cast<PHINode>(I)) + BB = Phi->getIncomingBlock( + PHINode::getIncomingValueNumForOperand(U.getOperandNo())); + return BB == PredBB; + }; + + // Iteratively sink the scalarized operands of the predicated instruction + // into the block we created for it. When an instruction is sunk, it's + // operands are then added to the worklist. The algorithm ends after one pass + // through the worklist doesn't sink a single instruction. + bool Changed; + do { + + // Add the instructions that need to be reanalyzed to the worklist, and + // reset the changed indicator. + Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); + InstsToReanalyze.clear(); + Changed = false; + + while (!Worklist.empty()) { + auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); + + // We can't sink an instruction if it is a phi node, is already in the + // predicated block, is not in the loop, or may have side effects. + if (!I || isa<PHINode>(I) || I->getParent() == PredBB || + !VectorLoop->contains(I) || I->mayHaveSideEffects()) + continue; + + // It's legal to sink the instruction if all its uses occur in the + // predicated block. 
Otherwise, there's nothing to do yet, and we may + // need to reanalyze the instruction. + if (!all_of(I->uses(), isBlockOfUsePredicated)) { + InstsToReanalyze.push_back(I); + continue; + } + + // Move the instruction to the beginning of the predicated block, and add + // it's operands to the worklist. + I->moveBefore(&*PredBB->getFirstInsertionPt()); + Worklist.insert(I->op_begin(), I->op_end()); + + // The sinking may have enabled other instructions to be sunk, so we will + // need to iterate. + Changed = true; + } + } while (Changed); +} + +void InnerLoopVectorizer::predicateInstructions() { + + // For each instruction I marked for predication on value C, split I into its + // own basic block to form an if-then construct over C. Since I may be fed by + // an extractelement instruction or other scalar operand, we try to + // iteratively sink its scalar operands into the predicated block. If I feeds + // an insertelement instruction, we try to move this instruction into the + // predicated block as well. For non-void types, a phi node will be created + // for the resulting value (either vector or scalar). + // + // So for some predicated instruction, e.g. the conditional sdiv in: + // + // for.body: + // ... + // %add = add nsw i32 %mul, %0 + // %cmp5 = icmp sgt i32 %2, 7 + // br i1 %cmp5, label %if.then, label %if.end + // + // if.then: + // %div = sdiv i32 %0, %1 + // br label %if.end + // + // if.end: + // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] + // + // the sdiv at this point is scalarized and if-converted using a select. + // The inactive elements in the vector are not used, but the predicated + // instruction is still executed for all vector elements, essentially: + // + // vector.body: + // ... + // %17 = add nsw <2 x i32> %16, %wide.load + // %29 = extractelement <2 x i32> %wide.load, i32 0 + // %30 = extractelement <2 x i32> %wide.load51, i32 0 + // %31 = sdiv i32 %29, %30 + // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 + // %35 = extractelement <2 x i32> %wide.load, i32 1 + // %36 = extractelement <2 x i32> %wide.load51, i32 1 + // %37 = sdiv i32 %35, %36 + // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 + // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 + // + // Predication will now re-introduce the original control flow to avoid false + // side-effects by the sdiv instructions on the inactive elements, yielding + // (after cleanup): + // + // vector.body: + // ... 
+ // %5 = add nsw <2 x i32> %4, %wide.load + // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7> + // %9 = extractelement <2 x i1> %8, i32 0 + // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue + // + // pred.sdiv.if: + // %10 = extractelement <2 x i32> %wide.load, i32 0 + // %11 = extractelement <2 x i32> %wide.load51, i32 0 + // %12 = sdiv i32 %10, %11 + // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 + // br label %pred.sdiv.continue + // + // pred.sdiv.continue: + // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] + // %15 = extractelement <2 x i1> %8, i32 1 + // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 + // + // pred.sdiv.if54: + // %16 = extractelement <2 x i32> %wide.load, i32 1 + // %17 = extractelement <2 x i32> %wide.load51, i32 1 + // %18 = sdiv i32 %16, %17 + // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 + // br label %pred.sdiv.continue55 + // + // pred.sdiv.continue55: + // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] + // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 + + for (auto KV : PredicatedInstructions) { + BasicBlock::iterator I(KV.first); + BasicBlock *Head = I->getParent(); + auto *BB = SplitBlock(Head, &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT, LI); + I->moveBefore(T); + sinkScalarOperands(&*I); + + I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if"); + BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue"); + + // If the instruction is non-void create a Phi node at reconvergence point. + if (!I->getType()->isVoidTy()) { + Value *IncomingTrue = nullptr; + Value *IncomingFalse = nullptr; + + if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) { + // If the predicated instruction is feeding an insert-element, move it + // into the Then block; Phi node will be created for the vector. + InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin()); + IEI->moveBefore(T); + IncomingTrue = IEI; // the new vector with the inserted element. + IncomingFalse = IEI->getOperand(0); // the unmodified vector + } else { + // Phi node will be created for the scalar predicated instruction. + IncomingTrue = &*I; + IncomingFalse = UndefValue::get(I->getType()); + } + + BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); + assert(PostDom && "Then block has multiple successors"); + PHINode *Phi = + PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); + IncomingTrue->replaceAllUsesWith(Phi); + Phi->addIncoming(IncomingFalse, Head); + Phi->addIncoming(IncomingTrue, I->getParent()); + } + } + + DEBUG(DT->verifyDomTree()); +} + InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { - assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && - "Invalid edge"); + assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); @@ -4033,12 +4562,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction( - Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, - unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode *P = cast<PHINode>(PN); // Handle recurrences. 
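
To make the predication example in the comment above concrete, here is an editor's scalar C++ analogue (not patch content) of the guarded per-lane sdiv: each lane divides only when its predicate bit is set, and inactive lanes keep the fall-through value, which is what the select/phi merge achieves in IR. Plain arrays stand in for IR vectors; the lane count of 2 matches the example:

#include <array>
#include <cstdio>

template <unsigned VF>
std::array<int, VF> guardedSDiv(const std::array<int, VF> &A,
                                const std::array<int, VF> &B,
                                const std::array<bool, VF> &Pred,
                                const std::array<int, VF> &FalseVal) {
  std::array<int, VF> R = {};
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    R[Lane] = Pred[Lane] ? A[Lane] / B[Lane] // the pred.sdiv.if block
                         : FalseVal[Lane];   // inactive lane: no division runs
  return R;
}

int main() {
  std::array<int, 2> R =
      guardedSDiv<2>({{8, 9}}, {{2, 0}}, {{true, false}}, {{-1, -1}});
  std::printf("%d %d\n", R[0], R[1]); // prints "4 -1"; no division by zero
}
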
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + VectorParts Entry(UF); for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = @@ -4046,6 +4575,7 @@ void InnerLoopVectorizer::widenPHIInstruction( Entry[part] = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); } + VectorLoopValueMap.initVector(P, Entry); PV->push_back(P); return; } @@ -4066,10 +4596,11 @@ void InnerLoopVectorizer::widenPHIInstruction( // SELECT(Mask3, In3, // SELECT(Mask2, In2, // ( ...))) + VectorParts Entry(UF); for (unsigned In = 0; In < NumIncoming; In++) { VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), P->getParent()); - VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + const VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); for (unsigned part = 0; part < UF; ++part) { // We might have single edge PHIs (blocks) - use an identity @@ -4083,6 +4614,7 @@ void InnerLoopVectorizer::widenPHIInstruction( "predphi"); } } + VectorLoopValueMap.initVector(P, Entry); return; } @@ -4099,46 +4631,95 @@ void InnerLoopVectorizer::widenPHIInstruction( case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: - return widenIntInduction(P, Entry); - case InductionDescriptor::IK_PtrInduction: + return widenIntInduction(P); + case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. Value *PtrInd = Induction; PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); - // This is the vector of results. Notice that we don't generate - // vector geps because scalar geps result in better code. - for (unsigned part = 0; part < UF; ++part) { - if (VF == 1) { - int EltIndex = part; - Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); - SclrGep->setName("next.gep"); - Entry[part] = SclrGep; - continue; - } - - Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); - for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF; + // These are the scalar results. Notice that we don't generate vector GEPs + // because scalar GEPs result in better code. 
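
The select-chain blend described above for if-converted phis can be modeled lane by lane: incoming value 0 seeds the result (the identity select), and each later incoming value is taken wherever its edge mask is set. This is an illustrative sketch only; std::vector lanes and the blendPhiSketch name stand in for vector values, masks, and the real widening code:

#include <cstddef>
#include <cstdio>
#include <vector>

typedef std::vector<int> Lanes;
typedef std::vector<bool> Mask;

Lanes blendPhiSketch(const std::vector<Lanes> &Incoming,
                     const std::vector<Mask> &EdgeMask) {
  Lanes Blend = Incoming[0];
  for (std::size_t In = 1; In < Incoming.size(); ++In)
    for (std::size_t L = 0; L < Blend.size(); ++L)
      if (EdgeMask[In][L])
        Blend[L] = Incoming[In][L]; // select(Mask_In, In_In, previous blend)
  return Blend;
}

int main() {
  Lanes R = blendPhiSketch({{1, 1, 1, 1}, {2, 2, 2, 2}},
                           {{true, true, true, true},
                            {false, true, false, true}});
  std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]); // prints "1 2 1 2"
}
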
+ ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(VF); + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VecVal = Builder.CreateInsertElement(VecVal, SclrGep, - Builder.getInt32(i), "insert.gep"); + Entry[Part][Lane] = SclrGep; } - Entry[part] = VecVal; } + VectorLoopValueMap.initScalar(P, Entry); return; } + case InductionDescriptor::IK_FpInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + assert(P != OldInduction && "Primary induction can be integer only"); + + Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType()); + V = II.transform(Builder, V, PSE.getSE(), DL); + V->setName("fp.offset.idx"); + + // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal + + Value *Broadcasted = getBroadcastInstrs(V); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc. + Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue(); + VectorParts Entry(UF); + for (unsigned part = 0; part < UF; ++part) + Entry[part] = getStepVector(Broadcasted, VF * part, StepVal, + II.getInductionOpcode()); + VectorLoopValueMap.initVector(P, Entry); + return; + } + } +} + +/// A helper function for checking whether an integer division-related +/// instruction may divide by zero (in which case it must be predicated if +/// executed conditionally in the scalar code). +/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). +/// Non-zero divisors that are non compile-time constants will not be +/// converted into multiplication, so we will still end up scalarizing +/// the division, but can do so w/o predication. +static bool mayDivideByZero(Instruction &I) { + assert((I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::URem || + I.getOpcode() == Instruction::SRem) && + "Unexpected instruction"); + Value *Divisor = I.getOperand(1); + auto *CInt = dyn_cast<ConstantInt>(Divisor); + return !CInt || CInt->isZero(); } void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (Instruction &I : *BB) { - VectorParts &Entry = WidenMap.get(&I); + + // If the instruction will become trivially dead when vectorized, we don't + // need to generate it. + if (DeadInstructions.count(&I)) + continue; + + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa<BranchInst>(&I) || isa<PHINode>(&I) || + isa<DbgInfoIntrinsic>(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + continue; + } switch (I.getOpcode()) { case Instruction::Br: @@ -4147,21 +4728,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. - widenPHIInstruction(&I, Entry, UF, VF, PV); + widenPHIInstruction(&I, UF, VF, PV); continue; } // End of PHI. 
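
A standalone restatement of the mayDivideByZero test introduced above, for illustration only: a divisor is safe to execute unpredicated only when it is a compile-time constant known to be non-zero. The null pointer plays the role of "the divisor is not a ConstantInt", and the Sketch suffix marks this as hypothetical code, not the pass's helper:

#include <cstdio>

bool mayDivideByZeroSketch(const long *ConstDivisor) {
  return !ConstDivisor || *ConstDivisor == 0;
}

int main() {
  long Zero = 0, Eight = 8;
  std::printf("%d %d %d\n",
              mayDivideByZeroSketch(nullptr), // unknown divisor -> predicate
              mayDivideByZeroSketch(&Zero),   // constant zero   -> predicate
              mayDivideByZeroSketch(&Eight)); // non-zero const  -> no predication
}
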
+ case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); + continue; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -4172,10 +4759,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Just widen binops. auto *BinOp = cast<BinaryOperator>(&I); setDebugLocFromInst(Builder, BinOp); - VectorParts &A = getVectorValue(BinOp->getOperand(0)); - VectorParts &B = getVectorValue(BinOp->getOperand(1)); + const VectorParts &A = getVectorValue(BinOp->getOperand(0)); + const VectorParts &B = getVectorValue(BinOp->getOperand(1)); // Use this vector value for all users of the original instruction. + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); @@ -4185,6 +4773,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, BinOp); break; } @@ -4201,20 +4790,19 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - VectorParts &Cond = getVectorValue(I.getOperand(0)); - VectorParts &Op0 = getVectorValue(I.getOperand(1)); - VectorParts &Op1 = getVectorValue(I.getOperand(2)); + const VectorParts &Cond = getVectorValue(I.getOperand(0)); + const VectorParts &Op0 = getVectorValue(I.getOperand(1)); + const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - Value *ScalarCond = - (VF == 1) - ? Cond[0] - : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); + auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part] = Builder.CreateSelect( InvariantCond ? 
ScalarCond : Cond[Part], Op0[Part], Op1[Part]); } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4225,8 +4813,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = dyn_cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); - VectorParts &A = getVectorValue(Cmp->getOperand(0)); - VectorParts &B = getVectorValue(Cmp->getOperand(1)); + const VectorParts &A = getVectorValue(Cmp->getOperand(0)); + const VectorParts &B = getVectorValue(Cmp->getOperand(1)); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; if (FCmp) { @@ -4238,6 +4827,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = C; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4268,8 +4858,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { auto ID = Legal->getInductionVars()->lookup(OldInduction); if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction && ID.getConstIntStepValue()) { - widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI)); - addMetadata(Entry, &I); + widenIntInduction(OldInduction, cast<TruncInst>(CI)); break; } @@ -4277,9 +4866,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); - VectorParts &A = getVectorValue(CI->getOperand(0)); + const VectorParts &A = getVectorValue(CI->getOperand(0)); + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4318,6 +4909,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { break; } + VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { @@ -4325,7 +4917,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Some intrinsics have a scalar argument - don't replace it with a // vector. if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); + const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); Arg = VectorArg[Part]; } Args.push_back(Arg); @@ -4363,6 +4955,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } + VectorLoopValueMap.initVector(&I, Entry); addMetadata(Entry, &I); break; } @@ -4414,7 +5007,8 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { - emitAnalysis(VectorizationReport() << "if-conversion is disabled"); + ORE->emit(createMissedAnalysis("IfConversionDisabled") + << "if-conversion is disabled"); return false; } @@ -4428,12 +5022,9 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (blockNeedsPredication(BB)) continue; - for (Instruction &I : *BB) { - if (auto *LI = dyn_cast<LoadInst>(&I)) - SafePointes.insert(LI->getPointerOperand()); - else if (auto *SI = dyn_cast<StoreInst>(&I)) - SafePointes.insert(SI->getPointerOperand()); - } + for (Instruction &I : *BB) + if (auto *Ptr = getPointerOperand(&I)) + SafePointes.insert(Ptr); } // Collect the blocks that need predication. 
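
The binary-operator, select, and compare cases above all follow the same widening shape: fetch the per-part vector operands, emit one wide operation per unroll part, and record the parts under the original instruction in the value map. A sketch of that bookkeeping with placeholder string "values" and a toy map; none of these types are LLVM's:

#include <map>
#include <string>
#include <utility>
#include <vector>

typedef std::string Value;
typedef std::vector<Value> VectorParts;

struct ToyValueMap {
  std::map<Value, VectorParts> Vectors;
  const VectorParts &getVector(const Value &V) { return Vectors[V]; }
  void initVector(const Value &V, VectorParts P) { Vectors[V] = std::move(P); }
};

void widenBinOpSketch(ToyValueMap &VM, const Value &I, const Value &Op0,
                      const Value &Op1, unsigned UF) {
  const VectorParts &A = VM.getVector(Op0);
  const VectorParts &B = VM.getVector(Op1);
  VectorParts Entry(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    Entry[Part] = "(" + A[Part] + " op " + B[Part] + ")"; // one wide op/part
  VM.initVector(I, Entry);
}

int main() {
  ToyValueMap VM;
  VM.initVector("x", {"x.0", "x.1"});
  VM.initVector("y", {"y.0", "y.1"});
  widenBinOpSketch(VM, "z", "x", "y", /*UF=*/2);
}
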
@@ -4441,21 +5032,21 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { for (BasicBlock *BB : TheLoop->blocks()) { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { - emitAnalysis(VectorizationReport(BB->getTerminator()) - << "loop contains a switch statement"); + ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) + << "loop contains a switch statement"); return false; } // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { if (!blockCanBePredicated(BB, SafePointes)) { - emitAnalysis(VectorizationReport(BB->getTerminator()) - << "control flow cannot be substituted for a select"); + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { - emitAnalysis(VectorizationReport(BB->getTerminator()) - << "control flow cannot be substituted for a select"); + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; } } @@ -4468,8 +5059,8 @@ bool LoopVectorizationLegality::canVectorize() { // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. if (!TheLoop->getLoopPreheader()) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } @@ -4478,21 +5069,22 @@ bool LoopVectorizationLegality::canVectorize() { // // We can only vectorize innermost loops. if (!TheLoop->empty()) { - emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); + ORE->emit(createMissedAnalysis("NotInnermostLoop") + << "loop is not the innermost loop"); return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } // We must have a single exiting block. if (!TheLoop->getExitingBlock()) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } @@ -4500,8 +5092,8 @@ bool LoopVectorizationLegality::canVectorize() { // checked at the end of each iteration. With that we can assume that all // instructions in the loop are executed the same number of times. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - emitAnalysis(VectorizationReport() - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); return false; } @@ -4519,8 +5111,8 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. 
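
Condensing the early-exit tests that canVectorize performs above into an editorial checklist; LoopShape is a hypothetical stand-in for the queries made on llvm::Loop, not an LLVM type:

struct LoopShape {
  bool HasPreheader;          // loop is in canonical form
  bool IsInnermost;           // only innermost loops are handled
  unsigned NumBackEdges;      // must be exactly one
  bool HasSingleExitingBlock; // a single exit test...
  bool ExitingBlockIsLatch;   // ...taken at the end of every iteration
};

bool hasVectorizableCFG(const LoopShape &L) {
  return L.HasPreheader && L.IsInnermost && L.NumBackEdges == 1 &&
         L.HasSingleExitingBlock && L.ExitingBlockIsLatch;
}
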
const SCEV *ExitCount = PSE.getBackedgeTakenCount(); if (ExitCount == PSE.getSE()->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() - << "could not determine number of loop iterations"); + ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations") + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -4537,9 +5129,6 @@ bool LoopVectorizationLegality::canVectorize() { return false; } - // Collect all of the variables that remain uniform after vectorization. - collectLoopUniforms(); - DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerChecking()->Need ? " (with a runtime bound check)" @@ -4556,14 +5145,20 @@ bool LoopVectorizationLegality::canVectorize() { if (UseInterleaved) InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); + // Collect all instructions that are known to be uniform after vectorization. + collectLoopUniforms(); + + // Collect all instructions that are known to be scalar after vectorization. + collectLoopScalars(); + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { - emitAnalysis(VectorizationReport() - << "Too many SCEV assumptions need to be made and checked " - << "at runtime"); + ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks") + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); return false; } @@ -4621,10 +5216,12 @@ void LoopVectorizationLegality::addInductionPhi( const DataLayout &DL = Phi->getModule()->getDataLayout(); // Get the widest type. - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + if (!PhiTy->isFloatingPointTy()) { + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + } // Int inductions are special because we only allow one IV. if (ID.getKind() == InductionDescriptor::IK_IntInduction && @@ -4667,8 +5264,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Check that this PHI type is allowed. if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(Phi) - << "loop control flow is not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -4681,16 +5278,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // identified reduction value with an outside user. if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) continue; - emitAnalysis(VectorizationReport(Phi) - << "value could not be identified as " - "an induction or reduction variable"); + ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) + << "value could not be identified as " + "an induction or reduction variable"); return false; } // We only allow if-converted PHIs with exactly two incoming values. 
if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(Phi) - << "control flow not understood by vectorizer"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } @@ -4705,8 +5302,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { addInductionPhi(Phi, ID, AllowedExit); + if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) + Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); continue; } @@ -4717,14 +5316,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // As a last resort, coerce the PHI to a AddRec expression // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) { + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { addInductionPhi(Phi, ID, AllowedExit); continue; } - emitAnalysis(VectorizationReport(Phi) - << "value that could not be identified as " - "reduction is used outside the loop"); + ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) + << "value that could not be identified as " + "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); return false; } // end of PHI handling @@ -4738,8 +5337,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(CI) - << "call instruction cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4750,8 +5349,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { getVectorIntrinsicIDForCall(CI, TLI), 1)) { auto *SE = PSE.getSE(); if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(CI) - << "intrinsic instruction cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) + << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; } @@ -4762,8 +5361,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if ((!VectorType::isValidElementType(I.getType()) && !I.getType()->isVoidTy()) || isa<ExtractElementInst>(I)) { - emitAnalysis(VectorizationReport(&I) - << "instruction return type cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) + << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } @@ -4772,8 +5371,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (auto *ST = dyn_cast<StoreInst>(&I)) { Type *T = ST->getValueOperand()->getType(); if (!VectorType::isValidElementType(T)) { - emitAnalysis(VectorizationReport(ST) - << "store instruction cannot be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) + << "store instruction cannot be vectorized"); return false; } @@ -4791,8 +5390,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. 
if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - emitAnalysis(VectorizationReport(&I) - << "value cannot be used outside the loop"); + ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) + << "value cannot be used outside the loop"); return false; } @@ -4802,8 +5401,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); if (Inductions.empty()) { - emitAnalysis(VectorizationReport() - << "loop induction variable could not be identified"); + ORE->emit(createMissedAnalysis("NoInductionVariable") + << "loop induction variable could not be identified"); return false; } } @@ -4817,12 +5416,132 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return true; } +void LoopVectorizationLegality::collectLoopScalars() { + + // If an instruction is uniform after vectorization, it will remain scalar. + Scalars.insert(Uniforms.begin(), Uniforms.end()); + + // Collect the getelementptr instructions that will not be vectorized. A + // getelementptr instruction is only vectorized if it is used for a legal + // gather or scatter operation. + for (auto *BB : TheLoop->blocks()) + for (auto &I : *BB) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + Scalars.insert(GEP); + continue; + } + auto *Ptr = getPointerOperand(&I); + if (!Ptr) + continue; + auto *GEP = getGEPInstruction(Ptr); + if (GEP && isLegalGatherOrScatter(&I)) + Scalars.erase(GEP); + } + + // An induction variable will remain scalar if all users of the induction + // variable and induction variable update remain scalar. + auto *Latch = TheLoop->getLoopLatch(); + for (auto &Induction : *getInductionVars()) { + auto *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + + // Determine if all users of the induction variable are scalar after + // vectorization. + auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I); + }); + if (!ScalarInd) + continue; + + // Determine if all users of the induction variable update instruction are + // scalar after vectorization. + auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == Ind || !TheLoop->contains(I) || Scalars.count(I); + }); + if (!ScalarIndUpdate) + continue; + + // The induction variable and its update instruction will remain scalar. + Scalars.insert(Ind); + Scalars.insert(IndUpdate); + } +} + +bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) { + if (isAccessInterleaved(I)) + return true; + if (auto *Ptr = getPointerOperand(I)) + return isConsecutivePtr(Ptr); + return false; +} + +bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { + if (!blockNeedsPredication(I->getParent())) + return false; + switch(I->getOpcode()) { + default: + break; + case Instruction::Store: + return !isMaskRequired(I); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + return mayDivideByZero(*I); + } + return false; +} + +bool LoopVectorizationLegality::memoryInstructionMustBeScalarized( + Instruction *I, unsigned VF) { + + // If the memory instruction is in an interleaved group, it will be + // vectorized and its pointer will remain uniform. + if (isAccessInterleaved(I)) + return false; + + // Get and ensure we have a valid memory instruction. 
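
The induction handling in collectLoopScalars above keeps a phi and its update scalar only when every other in-loop user is already known scalar. A small model of that all-users test, purely illustrative: instructions are reduced to integer ids, and InLoop and Scalars stand in for the loop-membership query and the Scalars set:

#include <algorithm>
#include <set>
#include <vector>

bool usersAreScalarSketch(int Counterpart, const std::vector<int> &Users,
                          const std::set<int> &InLoop,
                          const std::set<int> &Scalars) {
  return std::all_of(Users.begin(), Users.end(), [&](int U) {
    return U == Counterpart || !InLoop.count(U) || Scalars.count(U);
  });
}

int main() {
  // Phi id 1, update id 10, one other in-loop user id 11 that is scalar.
  std::set<int> InLoop = {1, 10, 11};
  std::set<int> Scalars = {11};
  return usersAreScalarSketch(/*Counterpart=*/10, {10, 11}, InLoop, Scalars)
             ? 0
             : 1;
}
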
+ LoadInst *LI = dyn_cast<LoadInst>(I); + StoreInst *SI = dyn_cast<StoreInst>(I); + assert((LI || SI) && "Invalid memory instruction"); + + // If the pointer operand is uniform (loop invariant), the memory instruction + // will be scalarized. + auto *Ptr = getPointerOperand(I); + if (LI && isUniform(Ptr)) + return true; + + // If the pointer operand is non-consecutive and neither a gather nor a + // scatter operation is legal, the memory instruction will be scalarized. + if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I)) + return true; + + // If the instruction is a store located in a predicated block, it will be + // scalarized. + if (isScalarWithPredication(I)) + return true; + + // If the instruction's allocated size doesn't equal it's type size, it + // requires padding and will be scalarized. + auto &DL = I->getModule()->getDataLayout(); + auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + if (hasIrregularType(ScalarTy, DL, VF)) + return true; + + // Otherwise, the memory instruction should be vectorized if the rest of the + // loop is. + return false; +} + void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! - // Collect variables that will remain uniform after vectorization. + // Collect instructions inside the loop that will remain uniform after + // vectorization. - // If V is not an instruction inside the current loop, it is a Value - // outside of the scope which we are interesting in. + // Global values, params and instructions outside of current loop are out of + // scope. auto isOutOfScope = [&](Value *V) -> bool { Instruction *I = dyn_cast<Instruction>(V); return (!I || !TheLoop->contains(I)); @@ -4830,30 +5549,82 @@ void LoopVectorizationLegality::collectLoopUniforms() { SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); - // Start with the conditional branch. - if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) { - Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0)); + + // Start with the conditional branch. If the branch condition is an + // instruction contained in the loop that is only used by the branch, it is + // uniform. + auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { Worklist.insert(Cmp); DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); } - // Also add all consecutive pointer values; these values will be uniform - // after vectorization (and subsequent cleanup). - for (auto *BB : TheLoop->blocks()) { + // Holds consecutive and consecutive-like pointers. Consecutive-like pointers + // are pointers that are treated like consecutive pointers during + // vectorization. The pointer operands of interleaved accesses are an + // example. + SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; + + // Holds pointer operands of instructions that are possibly non-uniform. + SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; + + // Iterate over the instructions in the loop, and collect all + // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible + // that a consecutive-like pointer operand will be scalarized, we collect it + // in PossibleNonUniformPtrs instead. We use two sets here because a single + // getelementptr instruction can be used by both vectorized and scalarized + // memory instructions. 
For example, if a loop loads and stores from the same + // location, but the store is conditional, the store will be scalarized, and + // the getelementptr won't remain uniform. + for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { - if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) { - Worklist.insert(&I); - DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n"); - } + + // If there's no pointer operand, there's nothing to do. + auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I)); + if (!Ptr) + continue; + + // True if all users of Ptr are memory accesses that have Ptr as their + // pointer operand. + auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool { + return getPointerOperand(U) == Ptr; + }); + + // Ensure the memory instruction will not be scalarized, making its + // pointer operand non-uniform. If the pointer operand is used by some + // instruction other than a memory access, we're not going to check if + // that other instruction may be scalarized here. Thus, conservatively + // assume the pointer operand may be non-uniform. + if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I)) + PossibleNonUniformPtrs.insert(Ptr); + + // If the memory instruction will be vectorized and its pointer operand + // is consecutive-like, the pointer operand should remain uniform. + else if (hasConsecutiveLikePtrOperand(&I)) + ConsecutiveLikePtrs.insert(Ptr); + + // Otherwise, if the memory instruction will be vectorized and its + // pointer operand is non-consecutive-like, the memory instruction should + // be a gather or scatter operation. Its pointer operand will be + // non-uniform. + else + PossibleNonUniformPtrs.insert(Ptr); + } + + // Add to the Worklist all consecutive and consecutive-like pointers that + // aren't also identified as possibly non-uniform. + for (auto *V : ConsecutiveLikePtrs) + if (!PossibleNonUniformPtrs.count(V)) { + DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); + Worklist.insert(V); } - } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be either already inside Worklist, or // out of scope. It ensures a uniform instruction will only be used // by uniform instructions or out of scope instructions. unsigned idx = 0; - do { + while (idx != Worklist.size()) { Instruction *I = Worklist[idx++]; for (auto OV : I->operand_values()) { @@ -4867,32 +5638,49 @@ void LoopVectorizationLegality::collectLoopUniforms() { DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); } } - } while (idx != Worklist.size()); + } + + // Returns true if Ptr is the pointer operand of a memory access instruction + // I, and I is known to not require scalarization. + auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I); + }; // For an instruction to be added into Worklist above, all its users inside - // the current loop should be already added into Worklist. This condition - // cannot be true for phi instructions which is always in a dependence loop. - // Because any instruction in the dependence cycle always depends on others - // in the cycle to be added into Worklist first, the result is no ones in - // the cycle will be added into Worklist in the end. - // That is why we process PHI separately. 
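
The two-set classification above reduces to a simple filter: a pointer stays a uniform candidate only if some access saw it as consecutive-like and no access marked it possibly non-uniform. An illustrative sketch, with strings standing in for the getelementptr instructions the pass actually tracks:

#include <set>
#include <string>
#include <vector>

std::vector<std::string>
uniformPtrCandidates(const std::set<std::string> &ConsecutiveLike,
                     const std::set<std::string> &PossiblyNonUniform) {
  std::vector<std::string> Candidates;
  for (const std::string &P : ConsecutiveLike)
    if (!PossiblyNonUniform.count(P))
      Candidates.push_back(P);
  return Candidates;
}
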
- for (auto &Induction : *getInductionVars()) { - auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); - if (all_of(PN->users(), - [&](User *U) -> bool { - return U == UpdateV || isOutOfScope(U) || - Worklist.count(cast<Instruction>(U)); - }) && - all_of(UpdateV->users(), [&](User *U) -> bool { - return U == PN || isOutOfScope(U) || - Worklist.count(cast<Instruction>(U)); - })) { - Worklist.insert(cast<Instruction>(PN)); - Worklist.insert(cast<Instruction>(UpdateV)); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n"); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n"); - } + // the loop should also be in Worklist. However, this condition cannot be + // true for phi nodes that form a cyclic dependence. We must process phi + // nodes separately. An induction variable will remain uniform if all users + // of the induction variable and induction variable update remain uniform. + // The code below handles both pointer and non-pointer induction variables. + for (auto &Induction : Inductions) { + auto *Ind = Induction.first; + auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); + + // Determine if all users of the induction variable are uniform after + // vectorization. + auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + isVectorizedMemAccessUse(I, Ind); + }); + if (!UniformInd) + continue; + + // Determine if all users of the induction variable update instruction are + // uniform after vectorization. + auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { + auto *I = cast<Instruction>(U); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + isVectorizedMemAccessUse(I, IndUpdate); + }); + if (!UniformIndUpdate) + continue; + + // The induction variable and its update instruction will remain uniform. + Worklist.insert(Ind); + Worklist.insert(IndUpdate); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); } Uniforms.insert(Worklist.begin(), Worklist.end()); @@ -4901,16 +5689,18 @@ void LoopVectorizationLegality::collectLoopUniforms() { bool LoopVectorizationLegality::canVectorizeMemory() { LAI = &(*GetLAA)(*TheLoop); InterleaveInfo.setLAI(LAI); - auto &OptionalReport = LAI->getReport(); - if (OptionalReport) - emitAnalysis(VectorizationReport(*OptionalReport)); + const OptimizationRemarkAnalysis *LAR = LAI->getReport(); + if (LAR) { + OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(), + "loop not vectorized: ", *LAR); + ORE->emit(VR); + } if (!LAI->canVectorizeMemory()) return false; if (LAI->hasStoreToLoopInvariantAddress()) { - emitAnalysis( - VectorizationReport() - << "write to a loop invariant address could not be vectorized"); + ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") + << "write to a loop invariant address could not be vectorized"); DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); return false; } @@ -4967,7 +5757,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( } } - // We don't predicate stores at the moment. 
if (I.mayWriteToMemory()) { auto *SI = dyn_cast<StoreInst>(&I); // We only support predication of stores in basic blocks with one @@ -4992,17 +5781,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( } if (I.mayThrow()) return false; - - // The instructions below can trap. - switch (I.getOpcode()) { - default: - continue; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - return false; - } } return true; @@ -5029,8 +5807,16 @@ void InterleavedAccessInfo::collectConstStrideAccesses( if (!LI && !SI) continue; - Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides); + Value *Ptr = getPointerOperand(&I); + // We don't check wrapping here because we don't know yet if Ptr will be + // part of a full group or a group with gaps. Checking wrapping for all + // pointers (even those that end up in groups with no gaps) will be overly + // conservative. For full groups, wrapping should be ok since if we would + // wrap around the address space we would do a memory access at nullptr + // even without the transformation. The wrapping checks are therefore + // deferred until after we've formed the interleaved groups. + int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); @@ -5234,20 +6020,66 @@ void InterleavedAccessInfo::analyzeInterleaving( if (Group->getNumMembers() != Group->getFactor()) releaseGroup(Group); - // If there is a non-reversed interleaved load group with gaps, we will need - // to execute at least one scalar epilogue iteration. This will ensure that - // we don't speculatively access memory out-of-bounds. Note that we only need - // to look for a member at index factor - 1, since every group must have a - // member at index zero. - for (InterleaveGroup *Group : LoadGroups) - if (!Group->getMember(Group->getFactor() - 1)) { + // Remove interleaved groups with gaps (currently only loads) whose memory + // accesses may wrap around. We have to revisit the getPtrStride analysis, + // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does + // not check wrapping (see documentation there). + // FORNOW we use Assume=false; + // TODO: Change to Assume=true but making sure we don't exceed the threshold + // of runtime SCEV assumptions checks (thereby potentially failing to + // vectorize altogether). + // Additional optional optimizations: + // TODO: If we are peeling the loop and we know that the first pointer doesn't + // wrap then we can deduce that all pointers in the group don't wrap. + // This means that we can forcefully peel the loop in order to only have to + // check the first pointer for no-wrap. When we'll change to use Assume=true + // we'll only need at most one runtime check per interleaved group. + // + for (InterleaveGroup *Group : LoadGroups) { + + // Case 1: A full group. Can Skip the checks; For full groups, if the wide + // load would wrap around the address space we would do a memory access at + // nullptr even without the transformation. + if (Group->getNumMembers() == Group->getFactor()) + continue; + + // Case 2: If first and last members of the group don't wrap this implies + // that all the pointers in the group don't wrap. 
+ // So we check only group member 0 (which is always guaranteed to exist), + // and group member Factor - 1; If the latter doesn't exist we rely on + // peeling (if it is a non-reveresed accsess -- see Case 3). + Value *FirstMemberPtr = getPointerOperand(Group->getMember(0)); + if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, + /*ShouldCheckWrap=*/true)) { + DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " + "first group member potentially pointer-wrapping.\n"); + releaseGroup(Group); + continue; + } + Instruction *LastMember = Group->getMember(Group->getFactor() - 1); + if (LastMember) { + Value *LastMemberPtr = getPointerOperand(LastMember); + if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, + /*ShouldCheckWrap=*/true)) { + DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " + "last group member potentially pointer-wrapping.\n"); + releaseGroup(Group); + } + } + else { + // Case 3: A non-reversed interleaved load group with gaps: We need + // to execute at least one scalar epilogue iteration. This will ensure + // we don't speculatively access memory out-of-bounds. We only need + // to look for a member at index factor - 1, since every group must have + // a member at index zero. if (Group->isReverse()) { releaseGroup(Group); - } else { - DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); - RequiresScalarEpilogue = true; + continue; } + DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); + RequiresScalarEpilogue = true; } + } } LoopVectorizationCostModel::VectorizationFactor @@ -5255,28 +6087,22 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = {1U, 0U}; if (OptForSize && Legal->getRuntimePointerChecking()->Need) { - emitAnalysis( - VectorizationReport() - << "runtime pointer checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); + ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") + << "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return Factor; } if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { - emitAnalysis( - VectorizationReport() - << "store that is conditionally executed prevents vectorization"); + ORE->emit(createMissedAnalysis("ConditionalStore") + << "store that is conditionally executed prevents vectorization"); DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); return Factor; } - // Find the trip count. - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5334,10 +6160,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { - // If we are unable to calculate the trip count then don't try to vectorize. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + + // If we don't know the precise trip count, don't try to vectorize. 
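
The three cases above amount to a small decision table for interleaved load groups with gaps. A hedged restatement in plain C++; the boolean inputs abstract the getPtrStride(..., ShouldCheckWrap=true) and Group->isReverse() queries, and this is an editor's sketch rather than the pass's code:

enum class GroupAction { Keep, Release, RequireScalarEpilogue };

GroupAction classifyLoadGroup(bool IsFull, bool FirstNoWrap,
                              bool HasLastMember, bool LastNoWrap,
                              bool IsReversed) {
  if (IsFull)
    return GroupAction::Keep;                 // Case 1: no gaps, no check
  if (!FirstNoWrap)
    return GroupAction::Release;              // Case 2: member 0 may wrap
  if (HasLastMember)
    return LastNoWrap ? GroupAction::Keep     // Case 2: both ends are safe
                      : GroupAction::Release; // Case 2: last member may wrap
  return IsReversed ? GroupAction::Release    // Case 3: reversed with gaps
                    : GroupAction::RequireScalarEpilogue; // Case 3: epilogue
}
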
if (TC < 2) { - emitAnalysis( - VectorizationReport() + ORE->emit( + createMissedAnalysis("UnknownLoopCountComplexCFG") << "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; @@ -5351,11 +6180,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { else { // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. - emitAnalysis(VectorizationReport() - << "cannot optimize for size and vectorize at the " - "same time. Enable vectorization of this loop " - "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os/-Oz"); + ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") + << "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os/-Oz"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } @@ -5367,6 +6196,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; + collectInstsToScalarize(UserVF); return Factor; } @@ -5712,15 +6542,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; - // Ignore instructions that are never used within the loop. - if (!Ends.count(I)) - continue; // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; for (Instruction *ToRemove : List) OpenIntervals.erase(ToRemove); + // Ignore instructions that are never used within the loop. + if (!Ends.count(I)) + continue; + // Skip ignored values. if (ValuesToIgnore.count(I)) continue; @@ -5772,10 +6603,160 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { return RUs; } +void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { + + // If we aren't vectorizing the loop, or if we've already collected the + // instructions to scalarize, there's nothing to do. Collection may already + // have occurred if we have a user-selected VF and are now computing the + // expected cost for interleaving. + if (VF < 2 || InstsToScalarize.count(VF)) + return; + + // Initialize a mapping for VF in InstsToScalalarize. If we find that it's + // not profitable to scalarize any instructions, the presence of VF in the + // map will indicate that we've analyzed it already. + ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + + // Find all the instructions that are scalar with predication in the loop and + // determine if it would be better to not if-convert the blocks they are in. + // If so, we also record the instructions to scalarize. 
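Under -Os/-Oz the checks above come down to modular arithmetic on the known trip count: a candidate width is only acceptable if it divides the trip count exactly, since any remainder would require the tail loop the remark complains about. One way to phrase that constraint as a standalone sketch (this is not the pass's actual clamping logic; MaxVectorSize is an assumed power-of-two target bound):

#include <cstdio>

// Returns the widest power-of-two VF (<= MaxVectorSize) that divides the
// known trip count evenly, or 1 if every candidate would leave a tail.
unsigned selectTaillessVF(unsigned TripCount, unsigned MaxVectorSize) {
  if (TripCount < 2)
    return 1;                        // unknown or tiny count: do not vectorize
  for (unsigned VF = MaxVectorSize; VF >= 2; VF /= 2)
    if (TripCount % VF == 0)
      return VF;
  return 1;
}

int main() {
  std::printf("%u\n", selectTaillessVF(64, 16));   // 16: no tail needed
  std::printf("%u\n", selectTaillessVF(37, 16));   // 1: any VF leaves a tail
}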
+ for (BasicBlock *BB : TheLoop->blocks()) { + if (!Legal->blockNeedsPredication(BB)) + continue; + for (Instruction &I : *BB) + if (Legal->isScalarWithPredication(&I)) { + ScalarCostsTy ScalarCosts; + if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + } + } +} + +int LoopVectorizationCostModel::computePredInstDiscount( + Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, + unsigned VF) { + + assert(!Legal->isUniformAfterVectorization(PredInst) && + "Instruction marked uniform-after-vectorization will be predicated"); + + // Initialize the discount to zero, meaning that the scalar version and the + // vector version cost the same. + int Discount = 0; + + // Holds instructions to analyze. The instructions we visit are mapped in + // ScalarCosts. Those instructions are the ones that would be scalarized if + // we find that the scalar version costs less. + SmallVector<Instruction *, 8> Worklist; + + // Returns true if the given instruction can be scalarized. + auto canBeScalarized = [&](Instruction *I) -> bool { + + // We only attempt to scalarize instructions forming a single-use chain + // from the original predicated block that would otherwise be vectorized. + // Although not strictly necessary, we give up on instructions we know will + // already be scalar to avoid traversing chains that are unlikely to be + // beneficial. + if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || + Legal->isScalarAfterVectorization(I)) + return false; + + // If the instruction is scalar with predication, it will be analyzed + // separately. We ignore it within the context of PredInst. + if (Legal->isScalarWithPredication(I)) + return false; + + // If any of the instruction's operands are uniform after vectorization, + // the instruction cannot be scalarized. This prevents, for example, a + // masked load from being scalarized. + // + // We assume we will only emit a value for lane zero of an instruction + // marked uniform after vectorization, rather than VF identical values. + // Thus, if we scalarize an instruction that uses a uniform, we would + // create uses of values corresponding to the lanes we aren't emitting code + // for. This behavior can be changed by allowing getScalarValue to clone + // the lane zero values for uniforms rather than asserting. + for (Use &U : I->operands()) + if (auto *J = dyn_cast<Instruction>(U.get())) + if (Legal->isUniformAfterVectorization(J)) + return false; + + // Otherwise, we can scalarize the instruction. + return true; + }; + + // Returns true if an operand that cannot be scalarized must be extracted + // from a vector. We will account for this scalarization overhead below. Note + // that the non-void predicated instructions are placed in their own blocks, + // and their return values are inserted into vectors. Thus, an extract would + // still be required. + auto needsExtract = [&](Instruction *I) -> bool { + return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + }; + + // Compute the expected cost discount from scalarizing the entire expression + // feeding the predicated instruction. We currently only consider expressions + // that are single-use instruction chains. + Worklist.push_back(PredInst); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + + // If we've already analyzed the instruction, there's nothing to do. + if (ScalarCosts.count(I)) + continue; + + // Compute the cost of the vector instruction. 
Note that this cost already + // includes the scalarization overhead of the predicated instruction. + unsigned VectorCost = getInstructionCost(I, VF).first; + + // Compute the cost of the scalarized instruction. This cost is the cost of + // the instruction as if it wasn't if-converted and instead remained in the + // predicated block. We will scale this cost by block probability after + // computing the scalarization overhead. + unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + + // Compute the scalarization overhead of needed insertelement instructions + // and phi nodes. + if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, + false, TTI); + ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); + } + + // Compute the scalarization overhead of needed extractelement + // instructions. For each of the instruction's operands, if the operand can + // be scalarized, add it to the worklist; otherwise, account for the + // overhead. + for (Use &U : I->operands()) + if (auto *J = dyn_cast<Instruction>(U.get())) { + assert(VectorType::isValidElementType(J->getType()) && + "Instruction has non-scalar type"); + if (canBeScalarized(J)) + Worklist.push_back(J); + else if (needsExtract(J)) + ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF), + false, true, TTI); + } + + // Scale the total scalar cost by block probability. + ScalarCost /= getReciprocalPredBlockProb(); + + // Compute the discount. A non-negative discount means the vector version + // of the instruction costs more, and scalarizing would be beneficial. + Discount += VectorCost - ScalarCost; + ScalarCosts[I] = ScalarCost; + } + + return Discount; +} + LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + collectInstsToScalarize(VF); + // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -5802,11 +6783,14 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { << VF << " For instruction: " << I << '\n'); } - // We assume that if-converted blocks have a 50% chance of being executed. - // When the code is scalar then some of the blocks are avoided due to CF. - // When the code is vectorized we execute all code paths. + // If we are vectorizing a predicated block, it will have been + // if-converted. This means that the block's instructions (aside from + // stores and instructions that may divide by zero) will now be + // unconditionally executed. For the scalar case, we may not always execute + // the predicated block. Thus, scale the block's cost by the probability of + // executing it. if (VF == 1 && Legal->blockNeedsPredication(BB)) - BlockCost.first /= 2; + BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; Cost.second |= BlockCost.second; @@ -5815,35 +6799,19 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Check if the load/store instruction \p I may be translated into -/// gather/scatter during vectorization. -/// -/// Pointer \p Ptr specifies address in memory for the given scalar memory -/// instruction. We need it to retrieve data type. -/// Using gather/scatter is possible when it is supported by target. 
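computePredInstDiscount, shown across the hunks above, walks the single-use chain feeding a predicated instruction and sums, per instruction, the vector cost minus the scalar cost scaled by the probability that the predicated block executes; a non-negative sum means the whole chain is at least as cheap scalarized. A standalone numeric sketch of that accounting, with made-up unit costs and the usual 50% block-probability assumption:

#include <cstdio>
#include <vector>

struct InstCost {
  unsigned VectorCost;  // cost of the widened instruction at VF
  unsigned ScalarCost;  // VF scalar copies plus insert/extract overhead
};

// Reciprocal of the probability that a predicated block executes; the cost
// model assumes 50%, i.e. a reciprocal of 2.
constexpr unsigned ReciprocalPredBlockProb = 2;

// Non-negative discount => scalarizing the chain is at least as cheap as
// keeping it vectorized and if-converted.
int chainDiscount(const std::vector<InstCost> &Chain) {
  int Discount = 0;
  for (const InstCost &C : Chain)
    Discount += (int)C.VectorCost -
                (int)(C.ScalarCost / ReciprocalPredBlockProb);
  return Discount;
}

int main() {
  // e.g. a predicated udiv fed by a single-use add (costs are placeholders).
  std::vector<InstCost> Chain = {{/*udiv*/ 24, 20}, {/*add*/ 4, 10}};
  std::printf("discount = %d\n", chainDiscount(Chain));  // 13 >= 0: scalarize
}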
-static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr, - LoopVectorizationLegality *Legal) { - auto *DataTy = cast<PointerType>(Ptr->getType())->getElementType(); - return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) || - (isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy)); -} - -/// \brief Check whether the address computation for a non-consecutive memory -/// access looks like an unlikely candidate for being merged into the indexing -/// mode. +/// \brief Gets Address Access SCEV after verifying that the access pattern +/// is loop invariant except the induction variable dependence. /// -/// We look for a GEP which has one index that is an induction variable and all -/// other indices are loop invariant. If the stride of this access is also -/// within a small bound we decide that this address computation can likely be -/// merged into the addressing mode. -/// In all other cases, we identify the address computation as complex. -static bool isLikelyComplexAddressComputation(Value *Ptr, - LoopVectorizationLegality *Legal, - ScalarEvolution *SE, - const Loop *TheLoop) { +/// This SCEV can be sent to the Target in order to estimate the address +/// calculation cost. +static const SCEV *getAddressAccessSCEV( + Value *Ptr, + LoopVectorizationLegality *Legal, + ScalarEvolution *SE, + const Loop *TheLoop) { auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); if (!Gep) - return true; + return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. @@ -5852,33 +6820,11 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, Value *Opd = Gep->getOperand(i); if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && !Legal->isInductionVariable(Opd)) - return true; + return nullptr; } - // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step - // can likely be merged into the address computation. - unsigned MaxMergeDistance = 64; - - const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); - if (!AddRec) - return true; - - // Check the step is constant. - const SCEV *Step = AddRec->getStepRecurrence(*SE); - // Calculate the pointer stride and check if it is consecutive. - const auto *C = dyn_cast<SCEVConstant>(Step); - if (!C) - return true; - - const APInt &APStepVal = C->getAPInt(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return true; - - int64_t StepVal = APStepVal.getSExtValue(); - - return StepVal > MaxMergeDistance; + // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. + return SE->getSCEV(Ptr); } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -5893,6 +6839,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (Legal->isUniformAfterVectorization(I)) VF = 1; + if (VF > 1 && isProfitableToScalarize(I, VF)) + return VectorizationCostTy(InstsToScalarize[VF][I], false); + Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); @@ -5905,7 +6854,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy) { Type *RetTy = I->getType(); - if (VF > 1 && MinBWs.count(I)) + if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); VectorTy = ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); @@ -5932,17 +6881,42 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // TODO: IF-converted IFs become selects. 
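The getAddressAccessSCEV rewrite in the hunk above no longer guesses whether the addressing is "complex"; it only verifies that the GEP has the shape gep ptr, %inv, %ind, %inv (every index loop-invariant except one induction variable) and, if so, forwards the pointer SCEV so the target can price the address computation itself. A simplified standalone model of that shape check (Index is a stand-in for the per-operand SCEV/induction queries):

#include <vector>

struct Index {
  bool LoopInvariant;  // models SE->isLoopInvariant(SCEV(op), L)
  bool IsInduction;    // models Legal->isInductionVariable(op)
};

// True when the access pattern is loop-invariant except for the induction
// dependence, i.e. the case in which the pointer SCEV would be handed to the
// target's address-computation cost hook.
bool addressSCEVUsable(const std::vector<Index> &GepIndices) {
  for (const Index &I : GepIndices)
    if (!I.LoopInvariant && !I.IsInduction)
      return false;
  return true;
}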
return 0; } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // If we have a predicated instruction, it may not be executed for each + // vector lane. Get the scalarization cost and scale this amount by the + // probability of executing the predicated block. If the instruction is not + // predicated, we fall through to the next case. + if (VF > 1 && Legal->isScalarWithPredication(I)) { + unsigned Cost = 0; + + // These instructions have a non-void type, so account for the phi nodes + // that we will create. This cost is likely to be zero. The phi node + // cost, if any, should be scaled by the block probability because it + // models a copy at the end of each predicated block. + Cost += VF * TTI.getCFInstrCost(Instruction::PHI); + + // The cost of the non-predicated instruction. + Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); + + // The cost of insertelement and extractelement instructions needed for + // scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // Scale the cost by the probability of executing the predicated blocks. + // This assumes the predicated block for each vector lane is equally + // likely. + return Cost / getReciprocalPredBlockProb(); + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -5965,7 +6939,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); - // Check for a splat of a constant or for a non uniform vector of constants. + // Check for a splat or for a non uniform vector of constants. if (isa<ConstantInt>(Op2)) { ConstantInt *CInt = cast<ConstantInt>(Op2); if (CInt && CInt->getValue().isPowerOf2()) @@ -5980,10 +6954,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; } + } else if (Legal->isUniform(Op2)) { + Op2VK = TargetTransformInfo::OK_UniformValue; } - - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, - Op1VP, Op2VP); + SmallVector<const Value *, 4> Operands(I->operand_values()); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -5999,9 +6975,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); - auto It = MinBWs.find(Op0AsInstruction); - if (VF > 1 && It != MinBWs.end()) - ValTy = IntegerType::get(ValTy->getContext(), It->second); + if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) + ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -6015,7 +6990,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); unsigned AS = SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace(); - Value *Ptr = SI ? 
SI->getPointerOperand() : LI->getPointerOperand(); + Value *Ptr = getPointerOperand(I); // We add the cost of address computation here instead of with the gep // instruction because only here we know whether the operation is // scalarized. @@ -6072,41 +7047,43 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return Cost; } - // Scalarized loads/stores. - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool UseGatherOrScatter = - (ConsecutiveStride == 0) && isGatherOrScatterLegal(I, Ptr, Legal); - - bool Reverse = ConsecutiveStride < 0; - const DataLayout &DL = I->getModule()->getDataLayout(); - uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); - uint64_t VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; - if ((!ConsecutiveStride && !UseGatherOrScatter) || - ScalarAllocatedSize != VectorElementSize) { - bool IsComplexComputation = - isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); + // Check if the memory instruction will be scalarized. + if (Legal->memoryInstructionMustBeScalarized(I, VF)) { unsigned Cost = 0; - // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - for (unsigned i = 0; i < VF; ++i) { - // The cost of extracting the pointer operand. - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); - // In case of STORE, the cost of ExtractElement from the vector. - // In case of LOAD, the cost of InsertElement into the returned - // vector. - Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement - : Instruction::InsertElement, - VectorTy, i); - } - // The cost of the scalar loads/stores. - Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + + // Get the cost of the scalar memory instruction and address computation. + Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); + + // Get the overhead of the extractelement and insertelement instructions + // we might create due to scalarization. + Cost += getScalarizationOverhead(I, VF, TTI); + + // If we have a predicated store, it may not be executed for each vector + // lane. Scale the cost by the probability of executing the predicated + // block. + if (Legal->isScalarWithPredication(I)) + Cost /= getReciprocalPredBlockProb(); + return Cost; } + // Determine if the pointer operand of the access is either consecutive or + // reverse consecutive. + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + + // Determine if either a gather or scatter operation is legal. + bool UseGatherOrScatter = + !ConsecutiveStride && Legal->isLegalGatherOrScatter(I); + unsigned Cost = TTI.getAddressComputationCost(VectorTy); if (UseGatherOrScatter) { assert(ConsecutiveStride == 0 && @@ -6147,7 +7124,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); - if (VF > 1 && MinBWs.count(I)) { + if (canTruncateToMinimalBitwidth(I, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 
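Both the predicated UDiv/SDiv/URem/SRem case and the scalarized-memory case above follow the same recipe: charge VF copies of the scalar operation, plus VF per-lane extras (address computation for memory, phi copies for non-void predicated ops), add the insert/extract overhead, and divide by the block-probability reciprocal when the block is predicated. A single parameterized sketch of that recipe (unit costs are placeholders for what TTI would return):

#include <cstdio>

constexpr unsigned ReciprocalPredBlockProb = 2;  // assumed 50% execution

unsigned scalarizedOpCost(unsigned VF, unsigned PerLaneOpCost,
                          unsigned PerLaneAddrOrPhiCost,
                          unsigned InsertExtractOverhead, bool Predicated) {
  unsigned Cost = VF * PerLaneOpCost + VF * PerLaneAddrOrPhiCost +
                  InsertExtractOverhead;
  return Predicated ? Cost / ReciprocalPredBlockProb : Cost;
}

int main() {
  // A predicated sdiv at VF=4: 4 divides, 4 phi copies, packing overhead.
  std::printf("%u\n", scalarizedOpCost(4, 20, 1, 12, /*Predicated=*/true));
  // A scalarized (gather-like) load at VF=4 in a non-predicated block.
  std::printf("%u\n", scalarizedOpCost(4, 4, 1, 8, /*Predicated=*/false));
}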
@@ -6176,28 +7153,11 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); return CallCost; } - default: { - // We are scalarizing the instruction. Return the cost of the scalar - // instruction, plus the cost of insert and extract into vector - // elements, times the vector width. - unsigned Cost = 0; - - if (!RetTy->isVoidTy() && VF != 1) { - unsigned InsCost = - TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy); - unsigned ExtCost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy); - - // The cost of inserting the results plus extracting each one of the - // operands. - Cost += VF * (InsCost + ExtCost * I->getNumOperands()); - } - + default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); - return Cost; - } + return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + + getScalarizationOverhead(I, VF, TTI); } // end of switch. } @@ -6217,6 +7177,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -6226,14 +7187,11 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { } bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { - // Check for a store. - if (auto *ST = dyn_cast<StoreInst>(Inst)) - return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; - - // Check for a load. - if (auto *LI = dyn_cast<LoadInst>(Inst)) - return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; + // Check if the pointer operand of a load or store instruction is + // consecutive. + if (auto *Ptr = getPointerOperand(Inst)) + return Legal->isConsecutivePtr(Ptr); return false; } @@ -6249,123 +7207,46 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - // Ignore induction phis that are only used in either GetElementPtr or ICmp - // instruction to exit loop. Induction variables usually have large types and - // can have big impact when estimating register usage. - // This is for when VF > 1. - for (auto &Induction : *Legal->getInductionVars()) { - auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); - - // Check that the PHI is only used by the induction increment (UpdateV) or - // by GEPs. Then check that UpdateV is only used by a compare instruction, - // the loop header PHI, or by GEPs. - // FIXME: Need precise def-use analysis to determine if this instruction - // variable will be vectorized. - if (all_of(PN->users(), - [&](const User *U) -> bool { - return U == UpdateV || isa<GetElementPtrInst>(U); - }) && - all_of(UpdateV->users(), [&](const User *U) -> bool { - return U == PN || isa<ICmpInst>(U) || isa<GetElementPtrInst>(U); - })) { - VecValuesToIgnore.insert(PN); - VecValuesToIgnore.insert(UpdateV); - } - } - - // Ignore instructions that will not be vectorized. - // This is for when VF > 1. 
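The rewritten default case above drops the open-coded insert/extract accounting in favor of a shared getScalarizationOverhead helper. Conceptually that helper charges one insertelement per produced lane and one extractelement per consumed operand lane, so the unknown-opcode cost becomes "VF copies of a mul" plus that overhead. A standalone model of the bookkeeping (unit costs are placeholders):

unsigned scalarizationOverhead(unsigned VF, bool ProducesValue,
                               unsigned NumVectorOperands,
                               unsigned InsertCost, unsigned ExtractCost) {
  unsigned Cost = 0;
  if (ProducesValue)
    Cost += VF * InsertCost;                     // re-packing the results
  Cost += VF * NumVectorOperands * ExtractCost;  // unpacking the operands
  return Cost;
}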
- for (BasicBlock *BB : TheLoop->blocks()) { - for (auto &Inst : *BB) { - switch (Inst.getOpcode()) - case Instruction::GetElementPtr: { - // Ignore GEP if its last operand is an induction variable so that it is - // a consecutive load/store and won't be vectorized as scatter/gather - // pattern. - - GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst); - unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - bool GepToIgnore = true; - - // Check that all of the gep indices are uniform except for the - // induction operand. - for (unsigned i = 0; i != NumOperands; ++i) { - if (i != InductionOperand && - !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), - TheLoop)) { - GepToIgnore = false; - break; - } - } - - if (GepToIgnore) - VecValuesToIgnore.insert(&Inst); - break; - } - } - } + // Insert values known to be scalar into VecValuesToIgnore. This is a + // conservative estimation of the values that will later be scalarized. + // + // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may + // still be scalarized. For example, we may find an instruction to be + // more profitable for a given vectorization factor if it were to be + // scalarized. But at this point, we haven't yet computed the + // vectorization factor. + for (auto *BB : TheLoop->getBlocks()) + for (auto &I : *BB) + if (Legal->isScalarAfterVectorization(&I)) + VecValuesToIgnore.insert(&I); } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, - bool IfPredicateStore) { + bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; setDebugLocFromInst(Builder, Instr); - // Find all of the vectorized parameters. - for (Value *SrcOp : Instr->operands()) { - // If we are accessing the old induction variable, use the new one. - if (SrcOp == OldInduction) { - Params.push_back(getVectorValue(SrcOp)); - continue; - } - - // Try using previously calculated values. - Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); - - // If the src is an instruction that appeared earlier in the basic block - // then it should already be vectorized. - if (SrcInst && OrigLoop->contains(SrcInst)) { - assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); - // The parameter is a vector value from earlier. - Params.push_back(WidenMap.get(SrcInst)); - } else { - // The parameter is a scalar from outside the loop. Maybe even a constant. - VectorParts Scalars; - Scalars.append(UF, SrcOp); - Params.push_back(Scalars); - } - } - - assert(Params.size() == Instr->getNumOperands() && - "Invalid number of operands"); - // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType()); - // Create a new entry in the WidenMap and initialize it to Undef or Null. - VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); + // Initialize a new scalar map entry. 
+ ScalarParts Entry(UF); VectorParts Cond; - if (IfPredicateStore) { - assert(Instr->getParent()->getSinglePredecessor() && - "Only support single predecessor blocks"); - Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), - Instr->getParent()); - } + if (IfPredicateInstr) + Cond = createBlockInMask(Instr->getParent()); // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { + Entry[Part].resize(1); // For each scalar that we create: // Start an "if (pred) a[i] = ..." block. Value *Cmp = nullptr; - if (IfPredicateStore) { + if (IfPredicateInstr) { if (Cond[Part]->getType()->isVectorTy()) Cond[Part] = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); @@ -6376,47 +7257,57 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // Replace the operands of the cloned instructions with extracted scalars. + + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - Value *Op = Params[op][Part]; - Cloned->setOperand(op, Op); + auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0); + Cloned->setOperand(op, NewOp); } // Place the cloned scalar in the new loop. Builder.Insert(Cloned); + // Add the cloned scalar to the scalar map entry. + Entry[Part][0] = Cloned; + // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Cloned; - // End if-block. - if (IfPredicateStore) - PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp)); + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } + VectorLoopValueMap.initScalar(Instr, Entry); } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast<StoreInst>(Instr); - bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); + bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - return scalarizeInstruction(Instr, IfPredicateStore); + return scalarizeInstruction(Instr, IfPredicateInstr); } Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, + Instruction::BinaryOps BinOp) { // When unrolling and the VF is 1, we only need to add a simple scalar. - Type *ITy = Val->getType(); - assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx); + Type *Ty = Val->getType(); + assert(!Ty->isVectorTy() && "Val must be a scalar"); + + if (Ty->isFloatingPointTy()) { + Constant *C = ConstantFP::get(Ty, (double)StartIdx); + + // Floating point operations had to be 'fast' to enable the unrolling. 
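The rewritten InnerLoopUnroller::scalarizeInstruction above clones the instruction once per unroll part, pulls operands from the scalar value map, and, when predicated, extracts the mask bit and wraps the clone in an if-block. A minimal standalone model of that per-part loop, where "parts" are plain array slots and the clone is just a callable (all names and values are illustrative):

#include <array>
#include <cstdio>

int main() {
  constexpr unsigned UF = 2;                   // unroll factor, VF == 1
  std::array<int, UF> Operand = {7, -3};
  std::array<bool, UF> Mask = {true, false};   // per-part predicate
  std::array<int, UF> Entry = {0, 0};          // the scalar map entry

  // The "cloned instruction": here an arbitrary scalar computation.
  auto Cloned = [](int X) { return X * X + 1; };

  for (unsigned Part = 0; Part < UF; ++Part) {
    if (!Mask[Part])            // predicated: skip parts whose mask is off
      continue;
    Entry[Part] = Cloned(Operand[Part]);       // record the clone's result
  }

  for (unsigned Part = 0; Part < UF; ++Part)
    std::printf("part %u -> %d\n", Part, Entry[Part]);
}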
+ Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); + return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); + } + Constant *C = ConstantInt::get(Ty, StartIdx); return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } @@ -6465,7 +7356,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, DisableUnrolling); + LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); DEBUG(dbgs() << "LV: Loop hints:" << " force=" @@ -6483,8 +7374,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Looking at the diagnostic output is the only way to determine if a loop // was vectorized (other than looking at the IR or machine code), so it // is important to generate an optimization remark for each loop. Most of - // these messages are generated by emitOptimizationRemarkAnalysis. Remarks - // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are + // these messages are generated as OptimizationRemarkAnalysis. Remarks + // generated as OptimizationRemark and OptimizationRemarkMissed are // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. @@ -6495,17 +7386,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: // do not vectorize loops with a tiny trip count. - const unsigned TC = SE->getSmallConstantTripCount(L); - if (TC > 0u && TC < TinyTripCountVectorThreshold) { + const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L); + if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - emitAnalysisDiag(F, L, Hints, VectorizationReport() - << "vectorization is not beneficial " - "and is not explicitly forced"); + ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NotBeneficial", L) + << "vectorization is not beneficial " + "and is not explicitly forced"); return false; } } @@ -6513,17 +7405,17 @@ bool LoopVectorizePass::processLoop(Loop *L) { PredicatedScalarEvolution PSE(*SE, *L); // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements; - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, + LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints); + emitMissedWarning(F, L, Hints, ORE); return false; } // Use the cost model. 
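The getStepVector change completed just above handles the unrolled (VF == 1) case: the "step vector" degenerates to a single scalar, the start value combined with StartIdx times the step, using add/mul for integers or the induction's recorded fast-math binary operator for floating point. A tiny sketch of that arithmetic (FBinOp stands in for the recorded operator; plain doubles cannot model the fast-math flags themselves):

#include <cstdio>
#include <functional>

// Integer case: Val + StartIdx * Step.
long long stepScalarInt(long long Val, int StartIdx, long long Step) {
  return Val + (long long)StartIdx * Step;
}

// FP case: BinOp(Val, StartIdx * Step).
double stepScalarFP(double Val, int StartIdx, double Step,
                    const std::function<double(double, double)> &FBinOp) {
  return FBinOp(Val, (double)StartIdx * Step);
}

int main() {
  std::printf("%lld\n", stepScalarInt(10, 3, 4));  // 22
  std::printf("%f\n", stepScalarFP(1.0, 3, 0.5,
                                   [](double A, double B) { return A + B; }));
}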
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints); CM.collectValuesToIgnore(); @@ -6551,11 +7443,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); - emitAnalysisDiag( - F, L, Hints, - VectorizationReport() - << "loop not vectorized due to NoImplicitFloat attribute"); - emitMissedWarning(F, L, Hints); + ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NoImplicitFloat", L) + << "loop not vectorized due to NoImplicitFloat attribute"); + emitMissedWarning(F, L, Hints, ORE); return false; } @@ -6566,10 +7457,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - emitAnalysisDiag(F, L, Hints, - VectorizationReport() - << "loop not vectorized due to unsafe FP support."); - emitMissedWarning(F, L, Hints); + ORE->emit( + createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) + << "loop not vectorized due to unsafe FP support."); + emitMissedWarning(F, L, Hints, ORE); return false; } @@ -6584,38 +7475,43 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserIC = Hints.getInterleave(); // Identify the diagnostic messages that should be produced. - std::string VecDiagMsg, IntDiagMsg; + std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (Requirements.doesNotMeet(F, L, Hints)) { DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " "requirements.\n"); - emitMissedWarning(F, L, Hints); + emitMissedWarning(F, L, Hints, ORE); return false; } if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - VecDiagMsg = - "the cost-model indicates that vectorization is not beneficial"; + VecDiagMsg = std::make_pair( + "VectorizationNotBeneficial", + "the cost-model indicates that vectorization is not beneficial"); VectorizeLoop = false; } if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); - IntDiagMsg = - "the cost-model indicates that interleaving is not beneficial"; + IntDiagMsg = std::make_pair( + "InterleavingNotBeneficial", + "the cost-model indicates that interleaving is not beneficial"); InterleaveLoop = false; - if (UserIC == 1) - IntDiagMsg += + if (UserIC == 1) { + IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; + IntDiagMsg.second += " and is explicitly disabled or interleave count is set to 1"; + } } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); - IntDiagMsg = "the cost-model indicates that interleaving is beneficial " - "but is explicitly disabled or interleave count is set to 1"; + IntDiagMsg = std::make_pair( + "InterleavingBeneficialButDisabled", + "the cost-model indicates that interleaving is beneficial " + "but is explicitly disabled or interleave count is set to 1"); InterleaveLoop = false; } @@ -6626,40 +7522,48 @@ bool LoopVectorizePass::processLoop(Loop *L) { const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { // Do not vectorize or interleaving the loop. 
- emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, - L->getStartLoc(), VecDiagMsg); - emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, - L->getStartLoc(), IntDiagMsg); + ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << VecDiagMsg.second); + ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << IntDiagMsg.second); return false; } else if (!VectorizeLoop && InterleaveLoop) { DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); - emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, - L->getStartLoc(), VecDiagMsg); + ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << VecDiagMsg.second); } else if (VectorizeLoop && !InterleaveLoop) { DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); - emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, - L->getStartLoc(), IntDiagMsg); + ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, + L->getStartLoc(), L->getHeader()) + << IntDiagMsg.second); } else if (VectorizeLoop && InterleaveLoop) { DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } + using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, IC); - Unroller.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore); - - emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), - Twine("interleaved loop (interleaved count: ") + - Twine(IC) + ")"); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, + &CM); + Unroller.vectorize(); + + ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"); } else { // If we decided that it is *legal* to vectorize the loop, then do it. - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, VF.Width, IC); - LB.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM); + LB.vectorize(); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are @@ -6669,10 +7573,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), - Twine("vectorized loop (vectorization width: ") + - Twine(VF.Width) + ", interleaved count: " + - Twine(IC) + ")"); + ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), + L->getHeader()) + << "vectorized loop (vectorization width: " + << NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << NV("InterleaveCount", IC) << ")"); } // Mark the loop as already vectorized to avoid vectorizing again. 
@@ -6686,7 +7591,8 @@ bool LoopVectorizePass::runImpl( Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, - std::function<const LoopAccessInfo &(Loop &)> &GetLAA_) { + std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, + OptimizationRemarkEmitter &ORE_) { SE = &SE_; LI = &LI_; @@ -6698,6 +7604,7 @@ bool LoopVectorizePass::runImpl( AC = &AC_; GetLAA = &GetLAA_; DB = &DB_; + ORE = &ORE_; // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -6742,17 +7649,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); - auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - return LAM.getResult<LoopAccessAnalysis>(L); + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + return LAM.getResult<LoopAccessAnalysis>(L, AR); }; - bool Changed = runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA); + bool Changed = + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fb94f79..328f270 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -115,22 +115,22 @@ static bool isValidElementType(Type *Ty) { !Ty->isPPC_FP128Ty(); } -/// \returns the parent basic block if all of the instructions in \p VL -/// are in the same block or null otherwise. -static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { +/// \returns true if all of the instructions in \p VL are in the same block or +/// false otherwise. +static bool allSameBlock(ArrayRef<Value *> VL) { Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) - return nullptr; + return false; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); if (!I) - return nullptr; + return false; if (BB != I->getParent()) - return nullptr; + return false; } - return BB; + return true; } /// \returns True if all of the values in \p VL are constants. @@ -211,12 +211,12 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { /// of each scalar operation (VL) that will be converted into a vector (I). /// Flag set: NSW, NUW, exact, and all of fast-math. static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { - if (auto *VecOp = dyn_cast<BinaryOperator>(I)) { - if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) { + if (auto *VecOp = dyn_cast<Instruction>(I)) { + if (auto *Intersection = dyn_cast<Instruction>(VL[0])) { // Intersection is initialized to the 0th scalar, // so start counting from index '1'. 
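In the new pass-manager path above, loop-access info is no longer fetched eagerly; the pass captures a std::function that asks the inner analysis manager for a result only when a particular loop is actually examined. A generic standalone sketch of that compute-on-demand-through-a-callback pattern (Loop and LoopAccessInfo here are trivial stand-ins, not the LLVM classes):

#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct Loop { std::string Name; };
struct LoopAccessInfo { unsigned NumRuntimeChecks; };

int main() {
  std::map<std::string, LoopAccessInfo> Cache;

  // The "analysis manager": computes (expensively) and caches per loop.
  std::function<const LoopAccessInfo &(const Loop &)> GetLAA =
      [&](const Loop &L) -> const LoopAccessInfo & {
        auto It = Cache.find(L.Name);
        if (It == Cache.end()) {
          std::printf("computing LAA for %s\n", L.Name.c_str());
          It = Cache.emplace(L.Name, LoopAccessInfo{2}).first;
        }
        return It->second;
      };

  Loop Inner{"inner"};
  // Only loops that are actually processed pay for the analysis.
  std::printf("%u\n", GetLAA(Inner).NumRuntimeChecks);
  std::printf("%u\n", GetLAA(Inner).NumRuntimeChecks);  // served from cache
}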
for (int i = 1, e = VL.size(); i < e; ++i) { - if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i])) + if (auto *Scalar = dyn_cast<Instruction>(VL[i])) Intersection->andIRFlags(Scalar); } VecOp->copyIRFlags(Intersection); @@ -224,15 +224,15 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { } } -/// \returns The type that all of the values in \p VL have or null if there -/// are different types. -static Type* getSameType(ArrayRef<Value *> VL) { +/// \returns true if all of the values in \p VL have the same type or false +/// otherwise. +static bool allSameType(ArrayRef<Value *> VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) - return nullptr; + return false; - return Ty; + return true; } /// \returns True if Extract{Value,Element} instruction extracts element Idx. @@ -393,6 +393,10 @@ public: /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; + /// \returns True if the VectorizableTree is both tiny and not fully + /// vectorizable. We do not vectorize such trees. + bool isTreeTinyAndNotFullyVectorizable(); + private: struct TreeEntry; @@ -883,10 +887,10 @@ private: /// List of users to ignore during scheduling and that don't need extracting. ArrayRef<Value *> UserIgnoreList; - // Number of load-bundles, which contain consecutive loads. + // Number of load bundles that contain consecutive loads. int NumLoadsWantToKeepOrder; - // Number of load-bundles of size 2, which are consecutive loads if reversed. + // Number of load bundles that contain consecutive loads in reversed order. int NumLoadsWantToChangeOrder; // Analysis and block reference. @@ -906,8 +910,11 @@ private: IRBuilder<> Builder; /// A map of scalar integer values to the smallest bit width with which they - /// can legally be represented. - MapVector<Value *, uint64_t> MinBWs; + /// can legally be represented. The values map to (width, signed) pairs, + /// where "width" indicates the minimum bit width and "signed" is True if the + /// value must be signed-extended, rather than zero-extended, back to its + /// original width. + MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; }; } // end namespace llvm @@ -917,7 +924,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { deleteTree(); UserIgnoreList = UserIgnoreLst; - if (!getSameType(Roots)) + if (!allSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -958,8 +965,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } // Ignore users in the user ignore list. - if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != - UserIgnoreList.end()) + if (is_contained(UserIgnoreList, UserInst)) continue; DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << @@ -972,9 +978,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { - bool SameTy = allConstant(VL) || getSameType(VL); (void)SameTy; bool isAltShuffle = false; - assert(SameTy && "Invalid types!"); + assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); @@ -1007,7 +1012,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } // If all of the operands are identical or constant we have a simple solution. 
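The propagateIRFlags hunk above now intersects wrapping/exact/fast-math flags across every scalar in the bundle (any Instruction, not just BinaryOperator) and copies the result onto the vector op: dropping a flag is always safe, keeping one requires every scalar to carry it. A standalone model with flags as a bitmask (the flag names are illustrative):

#include <cstdio>
#include <vector>

enum : unsigned { NSW = 1u << 0, NUW = 1u << 1, Exact = 1u << 2, Fast = 1u << 3 };

// Model of andIRFlags over a bundle: the vector op may only carry a flag
// that every scalar in the bundle carries.
unsigned intersectFlags(const std::vector<unsigned> &ScalarFlags) {
  unsigned Intersection = ~0u;
  for (unsigned F : ScalarFlags)
    Intersection &= F;
  return Intersection;
}

int main() {
  std::vector<unsigned> Bundle = {NSW | NUW, NSW, NSW | Exact};
  std::printf("vector flags: %#x\n", intersectFlags(Bundle));  // only NSW
}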
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { + if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false); return; @@ -1159,7 +1164,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } - // Check if the loads are consecutive or of we need to swizzle them. + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast<LoadInst>(VL[i]); if (!L->isSimple()) { @@ -1168,20 +1175,47 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } + } + // Check if the loads are consecutive, reversed, or neither. + // TODO: What we really want is to sort the loads, but for now, check + // the two likely directions. + bool Consecutive = true; + bool ReverseConsecutive = true; + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) { - ++NumLoadsWantToChangeOrder; - } - BS.cancelScheduling(VL); - newTreeEntry(VL, false); - DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - return; + Consecutive = false; + break; + } else { + ReverseConsecutive = false; } } - ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true); - DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + + if (Consecutive) { + ++NumLoadsWantToKeepOrder; + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + + // If none of the load pairs were consecutive when checked in order, + // check the reverse order. + if (ReverseConsecutive) + for (unsigned i = VL.size() - 1; i > 0; --i) + if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) { + ReverseConsecutive = false; + break; + } + + BS.cancelScheduling(VL); + newTreeEntry(VL, false); + + if (ReverseConsecutive) { + ++NumLoadsWantToChangeOrder; + DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); + } else { + DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); + } return; } case Instruction::ZExt: @@ -1541,8 +1575,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. if (MinBWs.count(VL[0])) - VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]), - VL.size()); + VecTy = VectorType::get( + IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); if (E->NeedToGather) { if (allConstant(VL)) @@ -1553,7 +1587,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return getGatherCost(E->Scalars); } unsigned Opcode = getSameOpcode(VL); - assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); + assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast<Instruction>(VL[0]); switch (Opcode) { case Instruction::PHI: { @@ -1762,7 +1796,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); - // We only handle trees of height 2. + // We only handle trees of heights 1 and 2. 
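The rewritten load handling above first tests whether each adjacent pair in the bundle is consecutive in address order, and only then whether the bundle is consecutive in reverse; a reversed bundle is still cancelled but counted so the caller can retry with the operands flipped. A slightly simplified but equivalent standalone check over plain addresses (ElementSize stands in for the DataLayout query):

#include <cstdint>
#include <vector>

enum class LoadOrder { Consecutive, ReverseConsecutive, Neither };

LoadOrder classifyLoads(const std::vector<uint64_t> &Addrs,
                        uint64_t ElementSize) {
  bool Consecutive = true, Reverse = true;
  for (size_t i = 0; i + 1 < Addrs.size(); ++i) {
    if (Addrs[i + 1] != Addrs[i] + ElementSize)
      Consecutive = false;               // not ascending by one element
    if (Addrs[i] != Addrs[i + 1] + ElementSize)
      Reverse = false;                   // not descending by one element
  }
  if (Consecutive)
    return LoadOrder::Consecutive;
  return Reverse ? LoadOrder::ReverseConsecutive : LoadOrder::Neither;
}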
+ if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) + return true; + if (VectorizableTree.size() != 2) return false; @@ -1779,6 +1816,27 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { return true; } +bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { + + // We can vectorize the tree if its size is greater than or equal to the + // minimum size specified by the MinTreeSize command line option. + if (VectorizableTree.size() >= MinTreeSize) + return false; + + // If we have a tiny tree (a tree whose size is less than MinTreeSize), we + // can vectorize it if we can prove it fully vectorizable. + if (isFullyVectorizableTinyTree()) + return false; + + assert(VectorizableTree.empty() + ? ExternalUses.empty() + : true && "We shouldn't have any external users"); + + // Otherwise, we can't vectorize the tree. It is both tiny and not fully + // vectorizable. + return true; +} + int BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, @@ -1816,9 +1874,9 @@ int BoUpSLP::getSpillCost() { ); // Now find the sequence of instructions between PrevInst and Inst. - BasicBlock::reverse_iterator InstIt(Inst->getIterator()), - PrevInstIt(PrevInst->getIterator()); - --PrevInstIt; + BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), + PrevInstIt = + PrevInst->getIterator().getReverse(); while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { PrevInstIt = Inst->getParent()->rbegin(); @@ -1846,14 +1904,6 @@ int BoUpSLP::getTreeCost() { DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - // We only vectorize tiny trees if it is fully vectorizable. - if (VectorizableTree.size() < MinTreeSize && !isFullyVectorizableTinyTree()) { - if (VectorizableTree.empty()) { - assert(!ExternalUses.size() && "We should not have any external users"); - } - return INT_MAX; - } - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); for (TreeEntry &TE : VectorizableTree) { @@ -1882,10 +1932,12 @@ int BoUpSLP::getTreeCost() { auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); auto *ScalarRoot = VectorizableTree[0].Scalars[0]; if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]); + auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); + auto Extend = + MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost( - Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane); + ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), + VecTy, EU.Lane); } else { ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); @@ -2182,7 +2234,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. 
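With the new isTreeTinyAndNotFullyVectorizable query above, getTreeCost no longer signals "don't bother" by returning INT_MAX; callers gate on the tree shape first. The gate is simple: a tree of at least MinTreeSize entries is always worth costing, and a smaller one only if it is provably fully vectorizable (height 1 with no gather, or the specific height-2 shapes). A compact standalone model of that decision:

// Returns true when SLP should give up on the tree before costing it.
// FullyVectorizableTiny models isFullyVectorizableTinyTree().
bool treeTinyAndNotFullyVectorizable(unsigned TreeSize, unsigned MinTreeSize,
                                     bool FullyVectorizableTiny) {
  if (TreeSize >= MinTreeSize)
    return false;                 // big enough to be costed normally
  if (FullyVectorizableTiny)
    return false;                 // tiny but provably fully vectorizable
  return true;                    // tiny and not fully vectorizable: skip
}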
- Builder.SetInsertPoint(BB, next(BasicBlock::iterator(LastInst))); + Builder.SetInsertPoint(BB, ++LastInst->getIterator()); Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -2383,6 +2435,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateICmp(P0, L, R); E->VectorizedValue = V; + propagateIRFlags(E->VectorizedValue, E->Scalars); ++NumVectorInstructions; return V; } @@ -2440,10 +2493,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (LHS == RHS && isa<Instruction>(LHS)) { - assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); - } - if (Value *V = alreadyVectorized(E->Scalars)) return V; @@ -2593,6 +2642,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); E->VectorizedValue = V; + propagateIRFlags(E->VectorizedValue, E->Scalars); ++NumVectorInstructions; return V; } @@ -2669,7 +2719,7 @@ Value *BoUpSLP::vectorizeTree() { if (auto *I = dyn_cast<Instruction>(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); auto BundleWidth = VectorizableTree[0].Scalars.size(); - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]); + auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); VectorizableTree[0].VectorizedValue = Trunc; @@ -2677,6 +2727,16 @@ Value *BoUpSLP::vectorizeTree() { DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + // If necessary, sign-extend or zero-extend ScalarRoot to the larger type + // specified by ScalarType. + auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) { + if (!MinBWs.count(ScalarRoot)) + return Ex; + if (MinBWs[ScalarRoot].second) + return Builder.CreateSExt(Ex, ScalarType); + return Builder.CreateZExt(Ex, ScalarType); + }; + // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -2684,8 +2744,7 @@ Value *BoUpSLP::vectorizeTree() { // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. 
- if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == - Scalar->user_end()) + if (!is_contained(Scalar->users(), User)) continue; assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); @@ -2712,8 +2771,7 @@ Value *BoUpSLP::vectorizeTree() { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); } Value *Ex = Builder.CreateExtractElement(Vec, Lane); - if (MinBWs.count(ScalarRoot)) - Ex = Builder.CreateSExt(Ex, Scalar->getType()); + Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(PH->getIncomingBlock(i)); PH->setOperand(i, Ex); } @@ -2721,16 +2779,14 @@ Value *BoUpSLP::vectorizeTree() { } else { Builder.SetInsertPoint(cast<Instruction>(User)); Value *Ex = Builder.CreateExtractElement(Vec, Lane); - if (MinBWs.count(ScalarRoot)) - Ex = Builder.CreateSExt(Ex, Scalar->getType()); + Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(cast<Instruction>(User)->getParent()); User->replaceUsesOfWith(Scalar, Ex); } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); - if (MinBWs.count(ScalarRoot)) - Ex = Builder.CreateSExt(Ex, Scalar->getType()); + Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); } @@ -2759,8 +2815,7 @@ Value *BoUpSLP::vectorizeTree() { assert((ScalarToTreeEntry.count(U) || // It is legal to replace users in the ignorelist by undef. - (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != - UserIgnoreList.end())) && + is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -2853,7 +2908,7 @@ void BoUpSLP::optimizeGatherSequence() { } } if (In) { - assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); + assert(!is_contained(Visited, In)); Visited.push_back(In); } } @@ -2994,9 +3049,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. - BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); + BasicBlock::reverse_iterator UpIter = + ++ScheduleStart->getIterator().getReverse(); BasicBlock::reverse_iterator UpperEnd = BB->rend(); - BasicBlock::iterator DownIter(ScheduleEnd); + BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { @@ -3451,6 +3507,11 @@ void BoUpSLP::computeMinimumValueSizes() { Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } + // True if the roots can be zero-extended back to their original type, rather + // than sign-extended. We know that if the leading bits are not demanded, we + // can safely zero-extend. So we initialize IsKnownPositive to True. + bool IsKnownPositive = true; + // If all the bits of the roots are demanded, we can try a little harder to // compute a narrower type. This can happen, for example, if the roots are // getelementptr indices. InstCombine promotes these indices to the pointer @@ -3462,11 +3523,41 @@ void BoUpSLP::computeMinimumValueSizes() { // compute the number of high-order bits we can truncate. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) { MaxBitWidth = 8u; + + // Determine if the sign bit of all the roots is known to be zero. If not, + // IsKnownPositive is set to False. 
+ IsKnownPositive = all_of(TreeRoot, [&](Value *R) { + bool KnownZero = false; + bool KnownOne = false; + ComputeSignBit(R, KnownZero, KnownOne, *DL); + return KnownZero; + }); + + // Determine the maximum number of bits required to store the scalar + // values. for (auto *Scalar : ToDemote) { auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT); auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth); } + + // If we can't prove that the sign bit is zero, we must add one to the + // maximum bit width to account for the unknown sign bit. This preserves + // the existing sign bit so we can safely sign-extend the root back to the + // original type. Otherwise, if we know the sign bit is zero, we will + // zero-extend the root instead. + // + // FIXME: This is somewhat suboptimal, as there will be cases where adding + // one to the maximum bit width will yield a larger-than-necessary + // type. In general, we need to add an extra bit only if we can't + // prove that the upper bit of the original type is equal to the + // upper bit of the proposed smaller type. If these two bits are the + // same (either zero or one) we know that sign-extending from the + // smaller type will result in the same value. Here, since we can't + // yet prove this, we are just making the proposed smaller type + // larger to ensure correctness. + if (!IsKnownPositive) + ++MaxBitWidth; } // Round MaxBitWidth up to the next power-of-two. @@ -3486,7 +3577,7 @@ void BoUpSLP::computeMinimumValueSizes() { // Finally, map the values we can demote to the maximum bit with we computed. for (auto *Scalar : ToDemote) - MinBWs[Scalar] = MaxBitWidth; + MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); } namespace { @@ -3642,8 +3733,7 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH, return !std::equal(VL.begin(), VL.end(), VH.begin()); } -bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, - int CostThreshold, BoUpSLP &R, +bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned VecRegSize) { unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen @@ -3672,12 +3762,15 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, ArrayRef<Value *> Operands = Chain.slice(i, VF); R.buildTree(Operands); + if (R.isTreeTinyAndNotFullyVectorizable()) + continue; + R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { + if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); R.vectorizeTree(); @@ -3691,7 +3784,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, - int costThreshold, BoUpSLP &R) { + BoUpSLP &R) { SetVector<StoreInst *> Heads, Tails; SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; @@ -3746,8 +3839,9 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? 
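The computeMinimumValueSizes change above computes the demoted width as the maximum over all scalars of (type bits - known sign bits), adds one bit when the roots' sign bit cannot be proven zero, and records the result together with the sign/zero-extension choice. A worked sketch of that arithmetic with made-up sign-bit counts (plain C++, no LLVM types; the 8-bit floor and power-of-two rounding mirror the surrounding code):

#include <algorithm>
#include <iostream>
#include <vector>

static unsigned roundUpToPow2(unsigned X) {
  unsigned P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

int main() {
  struct ScalarInfo { unsigned TypeBits, NumSignBits; };
  std::vector<ScalarInfo> ToDemote = {{32, 25}, {32, 27}}; // hypothetical scalars
  bool IsKnownPositive = false;                            // sign bit unproven

  unsigned MaxBitWidth = 8;                                // start from 8 bits
  for (const auto &S : ToDemote)
    MaxBitWidth = std::max(S.TypeBits - S.NumSignBits, MaxBitWidth);
  if (!IsKnownPositive)
    ++MaxBitWidth;                                         // keep room for a sign bit

  std::cout << roundUpToPow2(MaxBitWidth) << "\n";         // 9 rounds up to 16
  return 0;
}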
- for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) { - if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { + for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); + Size /= 2) { + if (vectorizeStoreChain(Operands, R, Size)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; @@ -3805,11 +3899,12 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, ArrayRef<Value *> BuildVector, - bool allowReorder) { + bool AllowReorder) { if (VL.size() < 2) return false; - DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() + << ".\n"); // Check that all of the parts are scalar instructions of the same type. Instruction *I0 = dyn_cast<Instruction>(VL[0]); @@ -3818,10 +3913,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned Opcode0 = I0->getOpcode(); - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. unsigned Sz = R.getVectorElementSize(I0); - unsigned VF = R.getMinVecRegSize() / Sz; + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); + unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); + if (MaxVF < 2) + return false; for (Value *V : VL) { Type *Ty = V->getType(); @@ -3837,70 +3933,89 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // Keep track of values that were deleted by vectorizing in the loop below. SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - unsigned OpsWidth = 0; + unsigned NextInst = 0, MaxInst = VL.size(); + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; + VF /= 2) { + // No actual vectorization should happen, if number of parts is the same as + // provided vectorization factor (i.e. the scalar type is used for vector + // code during codegen). + auto *VecTy = VectorType::get(VL[0]->getType(), VF); + if (TTI->getNumberOfParts(VecTy) == VF) + continue; + for (unsigned I = NextInst; I < MaxInst; ++I) { + unsigned OpsWidth = 0; - if (i + VF > e) - OpsWidth = e - i; - else - OpsWidth = VF; + if (I + VF > MaxInst) + OpsWidth = MaxInst - I; + else + OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) - break; + if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + break; - // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) - continue; + // Check that a previous iteration of this loop did not delete the Value. 
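tryToVectorizeList above now derives a range of vectorization factors (MinVF from the minimum register size, MaxVF from PowerOf2Floor of the list length) and walks the list in bundles while halving VF. The loop below is a simplified stand-alone sketch of that search order only; it omits the real code's NextInst bookkeeping, the getNumberOfParts skip, and the tail handling, and the list length and MinVF are assumed values:

#include <algorithm>
#include <iostream>

static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  unsigned NumValues = 6;  // hypothetical list length
  unsigned MinVF = 2;      // assumed MinVecRegSize / element size
  unsigned MaxVF = std::max(powerOf2Floor(NumValues), MinVF);

  // Try the widest bundles first, then halve the factor down to MinVF.
  for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2)
    for (unsigned I = 0; I + VF <= NumValues; I += VF)
      std::cout << "try bundle [" << I << ", " << I + VF << ") at VF=" << VF
                << "\n";
  return 0;
}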
+ if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + continue; - DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " - << "\n"); - ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); - - ArrayRef<Value *> BuildVectorSlice; - if (!BuildVector.empty()) - BuildVectorSlice = BuildVector.slice(i, OpsWidth); - - R.buildTree(Ops, BuildVectorSlice); - // TODO: check if we can allow reordering also for other cases than - // tryToVectorizePair() - if (allowReorder && R.shouldReorder()) { - assert(Ops.size() == 2); - assert(BuildVectorSlice.empty()); - Value *ReorderedOps[] = { Ops[1], Ops[0] }; - R.buildTree(ReorderedOps, None); - } - R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " + << "\n"); + ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); + + ArrayRef<Value *> BuildVectorSlice; + if (!BuildVector.empty()) + BuildVectorSlice = BuildVector.slice(I, OpsWidth); + + R.buildTree(Ops, BuildVectorSlice); + // TODO: check if we can allow reordering for more cases. + if (AllowReorder && R.shouldReorder()) { + // Conceptually, there is nothing actually preventing us from trying to + // reorder a larger list. In fact, we do exactly this when vectorizing + // reductions. However, at this point, we only expect to get here from + // tryToVectorizePair(). + assert(Ops.size() == 2); + assert(BuildVectorSlice.empty()); + Value *ReorderedOps[] = {Ops[1], Ops[0]}; + R.buildTree(ReorderedOps, None); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + continue; - if (Cost < -SLPCostThreshold) { - DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); - Value *VectorizedRoot = R.vectorizeTree(); - - // Reconstruct the build vector by extracting the vectorized root. This - // way we handle the case where some elements of the vector are undefined. - // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) - if (!BuildVectorSlice.empty()) { - // The insert point is the last build vector instruction. The vectorized - // root will precede it. This guarantees that we get an instruction. The - // vectorized tree could have been constant folded. - Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); - unsigned VecIdx = 0; - for (auto &V : BuildVectorSlice) { - IRBuilder<NoFolder> Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); - Instruction *I = cast<Instruction>(V); - assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I)); - Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( - VectorizedRoot, Builder.getInt32(VecIdx++))); - I->setOperand(1, Extract); - I->removeFromParent(); - I->insertAfter(Extract); - InsertAfter = I; + R.computeMinimumValueSizes(); + int Cost = R.getTreeCost(); + + if (Cost < -SLPCostThreshold) { + DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + Value *VectorizedRoot = R.vectorizeTree(); + + // Reconstruct the build vector by extracting the vectorized root. This + // way we handle the case where some elements of the vector are + // undefined. + // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) + if (!BuildVectorSlice.empty()) { + // The insert point is the last build vector instruction. The + // vectorized root will precede it. This guarantees that we get an + // instruction. The vectorized tree could have been constant folded. 
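A bundle is only vectorized when the modeled tree cost clears the global threshold, i.e. Cost < -SLPCostThreshold, which the hunk above now uses directly instead of a costThreshold parameter. A toy sketch of that gate with made-up numbers (the threshold default of 0 is an assumption about the -slp-threshold flag):

#include <iostream>

int main() {
  int SLPCostThreshold = 0; // assumed default of -slp-threshold
  int TreeCost = -3;        // hypothetical: vector form is 3 units cheaper

  if (TreeCost < -SLPCostThreshold)
    std::cout << "vectorize, cost=" << TreeCost << "\n";
  else
    std::cout << "keep scalar\n";
  return 0;
}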
+ Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); + unsigned VecIdx = 0; + for (auto &V : BuildVectorSlice) { + IRBuilder<NoFolder> Builder(InsertAfter->getParent(), + ++BasicBlock::iterator(InsertAfter)); + Instruction *I = cast<Instruction>(V); + assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I)); + Instruction *Extract = + cast<Instruction>(Builder.CreateExtractElement( + VectorizedRoot, Builder.getInt32(VecIdx++))); + I->setOperand(1, Extract); + I->removeFromParent(); + I->insertAfter(Extract); + InsertAfter = I; + } } + // Move to the next bundle. + I += VF - 1; + NextInst = I + 1; + Changed = true; } - // Move to the next bundle. - i += VF - 1; - Changed = true; } } @@ -3973,7 +4088,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, return ConstantVector::get(ShuffleMask); } - +namespace { /// Model horizontal reductions. /// /// A horizontal reduction is a tree of reduction operations (currently add and @@ -4006,7 +4121,14 @@ class HorizontalReduction { SmallVector<Value *, 32> ReducedVals; BinaryOperator *ReductionRoot; - PHINode *ReductionPHI; + // After successfull horizontal reduction vectorization attempt for PHI node + // vectorizer tries to update root binary op by combining vectorized tree and + // the ReductionPHI node. But during vectorization this ReductionPHI can be + // vectorized itself and replaced by the undef value, while the instruction + // itself is marked for deletion. This 'marked for deletion' PHI node then can + // be used in new binary operation, causing "Use still stuck around after Def + // is destroyed" crash upon PHI node deletion. + WeakVH ReductionPHI; /// The opcode of the reduction. unsigned ReductionOpcode; @@ -4025,14 +4147,13 @@ public: unsigned MinVecRegSize; HorizontalReduction(unsigned MinVecRegSize) - : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), - ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0), + : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), + IsPairwiseReduction(false), ReduxWidth(0), MinVecRegSize(MinVecRegSize) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { - assert((!Phi || - std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && + assert((!Phi || is_contained(Phi->operands(), B)) && "Thi phi needs to use the binary operator"); // We could have a initial reductions that is not an add. @@ -4113,12 +4234,21 @@ public: // Visit left or right. Value *NextV = TreeN->getOperand(EdgeToVist); - // We currently only allow BinaryOperator's and SelectInst's as reduction - // values in our tree. - if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV)) - Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0)); - else if (NextV != Phi) + if (NextV != Phi) { + auto *I = dyn_cast<Instruction>(NextV); + // Continue analysis if the next operand is a reduction operation or + // (possibly) a reduced value. If the reduced value opcode is not set, + // the first met operation != reduction operation is considered as the + // reduced value class. 
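The comment above explains why ReductionPHI became a WeakVH: the PHI the reduction is anchored on may itself be vectorized and erased before the root is rewired, so the handle must observe that instead of handing a dead value to the new binary operation. As a loose analogy only (LLVM's WeakVH is a value-handle callback, not reference counting), std::weak_ptr shows the same check-before-use shape:

#include <iostream>
#include <memory>

struct PHINodeLike { int Id; };

int main() {
  auto Phi = std::make_shared<PHINodeLike>(PHINodeLike{42});
  std::weak_ptr<PHINodeLike> ReductionPHI = Phi; // analogous to the WeakVH member

  Phi.reset(); // "the PHI was vectorized away and erased"

  if (auto P = ReductionPHI.lock())
    std::cout << "rewire reduction root with PHI " << P->Id << "\n";
  else
    std::cout << "PHI is gone; skip rewiring the reduction root\n";
  return 0;
}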
+ if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode || + I->getOpcode() == ReductionOpcode)) { + if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode) + ReducedValueOpcode = I->getOpcode(); + Stack.push_back(std::make_pair(I, 0)); + continue; + } return false; + } } return true; } @@ -4141,7 +4271,15 @@ public: unsigned i = 0; for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { - V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps); + auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); + V.buildTree(VL, ReductionOps); + if (V.shouldReorder()) { + SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend()); + V.buildTree(Reversed, ReductionOps); + } + if (V.isTreeTinyAndNotFullyVectorizable()) + continue; + V.computeMinimumValueSizes(); // Estimate cost. @@ -4175,7 +4313,7 @@ public: ReducedVals[i]); } // Update users. - if (ReductionPHI) { + if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) { assert(ReductionRoot && "Need a reduction operation"); ReductionRoot->setOperand(0, VectorizedTree); ReductionRoot->setOperand(1, ReductionPHI); @@ -4202,7 +4340,8 @@ private: int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; int ScalarReduxCost = - ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy); + (ReduxWidth - 1) * + TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy); DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost << " for reduction that starts with " << *FirstReducedVal @@ -4254,6 +4393,7 @@ private: return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } }; +} // end anonymous namespace /// \brief Recognize construction of vectors like /// %ra = insertelement <4 x float> undef, float %s0, i32 0 @@ -4354,7 +4494,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; // There is a loop latch, return the incoming value if it comes from - // that. This reduction pattern occassionaly turns up. + // that. This reduction pattern occasionally turns up. if (P->getIncomingBlock(0) == BBLatch) { Rdx = P->getIncomingValue(0); } else if (P->getIncomingBlock(1) == BBLatch) { @@ -4510,8 +4650,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(RI->getOperand(0))) { DEBUG(dbgs() << "SLP: Found a return to vectorize.\n"); - if (tryToVectorizePair(BinOp->getOperand(0), - BinOp->getOperand(1), R)) { + if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, + R.getMinVecRegSize()) || + tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1), + R)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4690,8 +4832,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), - -SLPCostThreshold, R); + Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); } } return Changed; |
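Two cost-model fixes sit in the hunk above: the scalar side of a reduction of ReduxWidth values needs ReduxWidth - 1 operations (four values collapse into three adds), and those operations are costed at the scalar type rather than the vector type. A small worked example of the corrected count, with invented per-operation costs:

#include <iostream>

int main() {
  unsigned ReduxWidth = 4;
  int ScalarAddCost = 1;  // hypothetical cost of one scalar add
  int VecReduxCost = 2;   // hypothetical cost of the shuffle-based reduction

  int ScalarReduxCost = (ReduxWidth - 1) * ScalarAddCost; // 3 adds, not 4
  std::cout << "scalar reduction cost = " << ScalarReduxCost << "\n";
  std::cout << "net vectorization cost = " << (VecReduxCost - ScalarReduxCost)
            << "\n"; // -1: profitable
  return 0;
}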