Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 788
1 file changed, 605 insertions, 183 deletions
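The headline change in this commit is support for "alternate sequence" bundles: scalar operations whose opcodes alternate between an operation and its counterpart (add/sub or fadd/fsub). The two vectorized halves are merged with a single shufflevector, which targets with paired add/subtract instructions (e.g. SSE3's addsubps, where available) can lower cheaply. As a minimal, hedged illustration, not taken from the patch itself, this is the kind of scalar pattern the change makes vectorizable:

// Hypothetical source-level example: even lanes add, odd lanes subtract.
// After this patch the SLP vectorizer can treat the four statements as one
// bundle, emit a vector fadd and a vector fsub, and blend them with a
// shufflevector; whether that lowers to a single addsub-style instruction
// depends on the backend.
void addsub(float *a, const float *b, const float *c) {
  a[0] = b[0] + c[0];
  a[1] = b[1] - c[1];
  a[2] = b[2] + c[2];
  a[3] = b[3] - c[3];
}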
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index c72b51f..53a43d9 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15,36 +15,39 @@ // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. // //===----------------------------------------------------------------------===// -#define SV_NAME "slp-vectorizer" -#define DEBUG_TYPE "SLP" - #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include <algorithm> #include <map> using namespace llvm; +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE "SLP" + static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -71,8 +74,6 @@ struct BlockNumbering { BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} - BlockNumbering() : BB(0), Valid(false) {} - void numberInstructions() { unsigned Loc = 0; InstrIdx.clear(); @@ -119,15 +120,15 @@ private: static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { Instruction *I0 = dyn_cast<Instruction>(VL[0]); if (!I0) - return 0; + return nullptr; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); if (!I) - return 0; + return nullptr; if (BB != I->getParent()) - return 0; + return nullptr; } return BB; } @@ -148,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } +/// \returns Opcode that can be clubbed with \p Op to create an alternate +/// sequence which can later be merged as a ShuffleVector instruction. +static unsigned getAltOpcode(unsigned Op) { + switch (Op) { + case Instruction::FAdd: + return Instruction::FSub; + case Instruction::FSub: + return Instruction::FAdd; + case Instruction::Add: + return Instruction::Sub; + case Instruction::Sub: + return Instruction::Add; + default: + return 0; + } +} + +/// \returns whether Opcode \p Op can be part +/// of an alternate sequence which can later be merged as +/// a ShuffleVector instruction. +static bool canCombineAsAltInst(unsigned Op) { + if (Op == Instruction::FAdd || Op == Instruction::FSub || + Op == Instruction::Sub || Op == Instruction::Add) + return true; + return false; +} + +/// \returns ShuffleVector instruction if instructions in \p VL form an +/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence +/// (e.g. opcodes fadd,fsub,fadd,fsub,...).
+static unsigned isAltInst(ArrayRef<Value *> VL) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + unsigned Opcode = I0->getOpcode(); + unsigned AltOpcode = getAltOpcode(Opcode); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) + return 0; + } + return Instruction::ShuffleVector; +} + /// \returns The opcode if all of the Instructions in \p VL have the same /// opcode, or zero. static unsigned getSameOpcode(ArrayRef<Value *> VL) { @@ -157,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) { unsigned Opcode = I0->getOpcode(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I || Opcode != I->getOpcode()) + if (!I || Opcode != I->getOpcode()) { + if (canCombineAsAltInst(Opcode) && i == 1) + return isAltInst(VL); return 0; + } } return Opcode; } @@ -179,7 +225,7 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { switch (Kind) { default: - MD = 0; // Remove unknown metadata + MD = nullptr; // Remove unknown metadata break; case LLVMContext::MD_tbaa: MD = MDNode::getMostGenericTBAA(MD, IMD); @@ -200,7 +246,7 @@ static Type* getSameType(ArrayRef<Value *> VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) - return 0; + return nullptr; return Ty; } @@ -343,18 +389,11 @@ public: typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallVector<StoreInst *, 8> StoreList; - BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, - DominatorTree *Dt) : - F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), - Builder(Se->getContext()) { - // Setup the block numbering utility for all of the blocks in the - // function. - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); - } - } + BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, + TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, + LoopInfo *Li, DominatorTree *Dt) + : F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + Builder(Se->getContext()) {} /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. @@ -364,13 +403,13 @@ public: /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst. + void buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -383,6 +422,7 @@ public: /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); + private: struct TreeEntry; @@ -442,10 +482,10 @@ private: /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even when the tree height is tiny.
- bool isFullyVectorizableTinyTree(); + bool isFullyVectorizableTinyTree(); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0), NeedToGather(0) {} /// \returns true if the scalars in VL are equal to this entry. @@ -521,19 +561,27 @@ private: /// Holds all of the instructions that we gathered. SetVector<Instruction *> GatherSeq; /// A list of blocks that we are going to CSE. - SmallSet<BasicBlock *, 8> CSEBlocks; + SetVector<BasicBlock *> CSEBlocks; /// Numbers instructions in different blocks. DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers; - /// Reduction operators. - ValueSet *RdxOps; + /// \brief Get the corresponding instruction numbering list for a given + /// BasicBlock. The list is allocated lazily. + BlockNumbering &getBlockNumbering(BasicBlock *BB) { + auto I = BlocksNumbers.insert(std::make_pair(BB, BlockNumbering(BB))); + return I.first->second; + } + + /// List of users to ignore during scheduling and that don't need extracting. + ArrayRef<Value *> UserIgnoreList; // Analysis and block reference. Function *F; ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; @@ -541,9 +589,10 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst) { deleteTree(); - RdxOps = Rdx; + UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -560,29 +609,29 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { if (Entry->NeedToGather) continue; - for (Value::use_iterator User = Scalar->use_begin(), - UE = Scalar->use_end(); User != UE; ++User) { - DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n"); + for (User *U : Scalar->users()) { + DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); // Skip in-tree scalars that become vectors. - if (ScalarToTreeEntry.count(*User)) { + if (ScalarToTreeEntry.count(U)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << - **User << ".\n"); - int Idx = ScalarToTreeEntry[*User]; (void) Idx; + *U << ".\n"); + int Idx = ScalarToTreeEntry[U]; (void) Idx; assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); continue; } - Instruction *UserInst = dyn_cast<Instruction>(*User); + Instruction *UserInst = dyn_cast<Instruction>(U); if (!UserInst) continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + // Ignore users in the user ignore list. 
+ if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != + UserIgnoreList.end()) continue; - DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " << + DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, *User, Lane)); + ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); } } } @@ -591,6 +640,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { bool SameTy = getSameType(VL); (void)SameTy; + bool isAltShuffle = false; assert(SameTy && "Invalid types!"); if (Depth == RecursionMaxDepth) { @@ -612,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { newTreeEntry(VL, false); return; } + unsigned Opcode = getSameOpcode(VL); + + // Check that this shuffle vector refers to the alternate + // sequence of opcodes. + if (Opcode == Instruction::ShuffleVector) { + Instruction *I0 = dyn_cast<Instruction>(VL[0]); + unsigned Op = I0->getOpcode(); + if (Op != Instruction::ShuffleVector) + isAltShuffle = true; + } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || - !getSameOpcode(VL)) { + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false); return; } @@ -669,57 +728,57 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { Instruction *Scalar = cast<Instruction>(VL[i]); DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); - for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); - U != UE; ++U) { - DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n"); - Instruction *User = dyn_cast<Instruction>(*U); - if (!User) { + for (User *U : Scalar->users()) { + DEBUG(dbgs() << "SLP: \tUser " << *U << ". \n"); + Instruction *UI = dyn_cast<Instruction>(U); + if (!UI) { DEBUG(dbgs() << "SLP: Gathering due to unknown user. \n"); newTreeEntry(VL, false); return; } // We don't care if the user is in a different basic block. - BasicBlock *UserBlock = User->getParent(); + BasicBlock *UserBlock = UI->getParent(); if (UserBlock != BB) { DEBUG(dbgs() << "SLP: User from a different basic block " - << *User << ". \n"); + << *UI << ". \n"); continue; } // If this is a PHINode within this basic block then we can place the // extract wherever we want. - if (isa<PHINode>(*User)) { - DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + if (isa<PHINode>(*UI)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *UI << ". \n"); continue; } // Check if this is a safe in-tree user. - if (ScalarToTreeEntry.count(User)) { - int Idx = ScalarToTreeEntry[User]; + if (ScalarToTreeEntry.count(UI)) { + int Idx = ScalarToTreeEntry[UI]; int VecLocation = VectorizableTree[Idx].LastScalarIndex; if (VecLocation <= MyLastIndex) { DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); newTreeEntry(VL, false); return; } - DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + DEBUG(dbgs() << "SLP: In-tree user (" << *UI << ") at #" << VecLocation << " vector value (" << *Scalar << ") at #" << MyLastIndex << ".\n"); continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(User)) + // Ignore users in the user ignore list.
+ if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) != + UserIgnoreList.end()) continue; // Make sure that we can schedule this unknown user. - BlockNumbering &BN = BlocksNumbers[BB]; - int UserIndex = BN.getIndex(User); + BlockNumbering &BN = getBlockNumbering(BB); + int UserIndex = BN.getIndex(UI); if (UserIndex < MyLastIndex) { DEBUG(dbgs() << "SLP: Can't schedule extractelement for " - << *User << ". \n"); + << *UI << ". \n"); newTreeEntry(VL, false); return; } @@ -738,11 +797,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // Check that instructions in this bundle don't reference other instructions. // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. for (unsigned i = 0, e = VL.size(); i < e; ++i) { - for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); - U != UE; ++U) { + for (User *U : VL[i]->users()) { for (unsigned j = 0; j < e; ++j) { - if (i != j && *U == VL[j]) { - DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + if (i != j && U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << *U << ". \n"); newTreeEntry(VL, false); return; } } } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned Opcode = getSameOpcode(VL); - // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { Instruction *Last = getLastInstruction(VL); @@ -778,7 +834,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i)); + TerminatorInst *Term = dyn_cast<TerminatorInst>( + cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); newTreeEntry(VL, false); return; } @@ -793,7 +850,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i)); + Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock( + PH->getIncomingBlock(i))); buildTree_rec(Operands, Depth + 1); } @@ -910,8 +968,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + BasicBlock *LeftBB = getSameBlock(Left); + BasicBlock *RightBB = getSameBlock(Right); + // If we have common uses on separate paths in the tree make sure we + // process the one with greater common depth first. + // We can use block numbering to determine the subtree traversal as + // the earlier user has to come between the common use and the later user.
+ if (LeftBB && RightBB && LeftBB == RightBB && + getLastIndex(Right) > getLastIndex(Left)) { + buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1); + } else { + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + } return; } @@ -925,12 +995,57 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { } return; } + case Instruction::GetElementPtr: { + // We don't combine GEPs with complicated (nested) indexing. + for (unsigned j = 0; j < VL.size(); ++j) { + if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); + newTreeEntry(VL, false); + return; + } + } + + // We can't combine several GEPs into one vector if they operate on + // different types. + Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); + for (unsigned j = 0; j < VL.size(); ++j) { + Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + if (Ty0 != CurTy) { + DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); + newTreeEntry(VL, false); + return; + } + } + + // We don't combine GEPs with non-constant indexes. + for (unsigned j = 0; j < VL.size(); ++j) { + auto Op = cast<Instruction>(VL[j])->getOperand(1); + if (!isa<ConstantInt>(Op)) { + DEBUG( + dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); + newTreeEntry(VL, false); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + for (unsigned i = 0, e = 2; i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1])) { newTreeEntry(VL, false); - DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } @@ -946,6 +1061,76 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { buildTree_rec(Operands, Depth + 1); return; } + case Instruction::Call: { + // Check if the calls are all to the same vectorizable intrinsic. + CallInst *CI = cast<CallInst>(VL[0]); + // Check if this is an Intrinsic call or something that can be + // represented by an intrinsic call. + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + if (!isTriviallyVectorizable(ID)) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return; + } + Function *Int = CI->getCalledFunction(); + Value *A1I = nullptr; + if (hasVectorInstrinsicScalarOpd(ID, 1)) + A1I = CI->getArgOperand(1); + for (unsigned i = 1, e = VL.size(); i != e; ++i) { + CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + if (!CI2 || CI2->getCalledFunction() != Int || + getIntrinsicIDForCall(CI2, TLI) != ID) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + << "\n"); + return; + } + // ctlz, cttz and powi are special intrinsics whose second argument + // should be the same in order for them to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) { + Value *A1J = CI2->getArgOperand(1); + if (A1I != A1J) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument " << A1I << "!=" << A1J + << "\n"); + return; + } + } + } + + newTreeEntry(VL, true); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) { + CallInst *CI2 = dyn_cast<CallInst>(VL[j]); + Operands.push_back(CI2->getArgOperand(i)); + } + buildTree_rec(Operands, Depth + 1); + } + return; + } + case Instruction::ShuffleVector: { + // If this is not an alternate sequence of opcodes like add-sub + // then do not vectorize this instruction. + if (!isAltShuffle) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: ShuffleVectors are not vectorized.\n"); + return; + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } default: newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); @@ -969,18 +1154,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return getGatherCost(E->Scalars); } - - assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && - "Invalid VL"); + unsigned Opcode = getSameOpcode(VL); + assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast<Instruction>(VL[0]); - unsigned Opcode = VL0->getOpcode(); switch (Opcode) { case Instruction::PHI: { return 0; } case Instruction::ExtractElement: { - if (CanReuseExtract(VL)) - return 0; + if (CanReuseExtract(VL)) { + int DeadCost = 0; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); + if (E->hasOneUse()) + // Take credit for the instruction that will become dead. + DeadCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + } + return -DeadCost; + } return getGatherCost(VecTy); } case Instruction::ZExt: @@ -1043,12 +1235,26 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; - // Check whether all second operands are constant. - for (unsigned i = 0; i < VL.size(); ++i) - if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) { + // If all operands are exactly the same ConstantInt then set the + // operand kind to OK_UniformConstantValue. + // If instead not all operands are constants, then set the operand kind + // to OK_AnyValue. If all operands are constants but not the same, + // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt = nullptr; + for (unsigned i = 0; i < VL.size(); ++i) { + const Instruction *I = cast<Instruction>(VL[i]); + if (!isa<ConstantInt>(I->getOperand(1))) { Op2VK = TargetTransformInfo::OK_AnyValue; break; } + if (i == 0) { + CInt = cast<ConstantInt>(I->getOperand(1)); + continue; + } + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast<ConstantInt>(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + } ScalarCost = VecTy->getNumElements() * @@ -1057,6 +1263,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return VecCost - ScalarCost; } + case Instruction::GetElementPtr: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); + int VecCost = + TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); + + return VecCost - ScalarCost; + } case Instruction::Load: { // Cost of wide load - cost of scalar loads. int ScalarLdCost = VecTy->getNumElements() * @@ -1071,6 +1291,55 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); return VecStCost - ScalarStCost; } + case Instruction::Call: { + CallInst *CI = cast<CallInst>(VL0); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + + // Calculate the cost of the scalar and vector calls. + SmallVector<Type*, 4> ScalarTys, VecTys; + for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { + ScalarTys.push_back(CI->getArgOperand(op)->getType()); + VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), + VecTy->getNumElements())); + } + + int ScalarCallCost = VecTy->getNumElements() * + TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); + + int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); + + DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost + << " (" << VecCallCost << "-" << ScalarCallCost << ")" + << " for " << *CI << "\n"); + + return VecCallCost - ScalarCallCost; + } + case Instruction::ShuffleVector: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + int ScalarCost = 0; + int VecCost = 0; + for (unsigned i = 0; i < VL.size(); ++i) { + Instruction *I = cast<Instruction>(VL[i]); + if (!I) + break; + ScalarCost += + TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } + // VecCost is equal to sum of the cost of creating 2 vectors + // and the cost of creating shuffle. + Instruction *I0 = cast<Instruction>(VL[0]); + VecCost = + TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); + Instruction *I1 = cast<Instruction>(VL[1]); + VecCost += + TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); + VecCost += + TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); + return VecCost - ScalarCost; + } default: llvm_unreachable("Unknown instruction"); } @@ -1084,11 +1353,15 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { if (VectorizableTree.size() != 2) return false; + // Handle splat stores. + if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) + return true; + // Gathering cost would be too much for tiny trees. 
- if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) - return false; + if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + return false; - return true; + return true; } int BoUpSLP::getTreeCost() { @@ -1113,16 +1386,19 @@ int BoUpSLP::getTreeCost() { Cost += C; } + SmallSet<Value *, 16> ExtractCostCalculated; int ExtractCost = 0; for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); I != E; ++I) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(I->Scalar)) + continue; VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I->Lane); } - DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); return Cost + ExtractCost; } @@ -1157,7 +1433,7 @@ Value *BoUpSLP::getPointerOperand(Value *I) { return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); - return 0; + return nullptr; } unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { @@ -1231,13 +1507,13 @@ Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { if (!A.Ptr || !B.Ptr || AA->alias(A, B)) return I; } - return 0; + return nullptr; } int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); for (unsigned i = 0, e = VL.size(); i < e; ++i) @@ -1247,8 +1523,8 @@ int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) { Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) { BasicBlock *BB = cast<Instruction>(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(cast<Instruction>(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) @@ -1307,7 +1583,7 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { @@ -1344,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E->Scalars); return Gather(E->Scalars, VecTy); } - - unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); + unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { case Instruction::PHI: { @@ -1528,6 +1802,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecTy->getPointerTo(AS)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(LI->getPointerOperand()->getType()); LI->setAlignment(Alignment); E->VectorizedValue = LI; return propagateMetadata(LI, E->Scalars); @@ -1547,14 +1823,123 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(SI->getPointerOperand()->getType()); S->setAlignment(Alignment); E->VectorizedValue = S; return propagateMetadata(S, 
E->Scalars); } + case Instruction::GetElementPtr: { + setInsertPointAfterBundle(E->Scalars); + + ValueList Op0VL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); + + Value *Op0 = vectorizeTree(Op0VL); + + std::vector<Value *> OpVecs; + for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; + ++j) { + ValueList OpVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); + + Value *OpVec = vectorizeTree(OpVL); + OpVecs.push_back(OpVec); + } + + Value *V = Builder.CreateGEP(Op0, OpVecs); + E->VectorizedValue = V; + + if (Instruction *I = dyn_cast<Instruction>(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } + case Instruction::Call: { + CallInst *CI = cast<CallInst>(VL0); + setInsertPointAfterBundle(E->Scalars); + Function *FI; + Intrinsic::ID IID = Intrinsic::not_intrinsic; + if (CI && (FI = CI->getCalledFunction())) { + IID = (Intrinsic::ID) FI->getIntrinsicID(); + } + std::vector<Value *> OpVecs; + for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { + ValueList OpVL; + // ctlz,cttz and powi are special intrinsics whose second argument is + // a scalar. This argument should not be vectorized. + if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { + CallInst *CEI = cast<CallInst>(E->Scalars[0]); + OpVecs.push_back(CEI->getArgOperand(j)); + continue; + } + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + CallInst *CEI = cast<CallInst>(E->Scalars[i]); + OpVL.push_back(CEI->getArgOperand(j)); + } + + Value *OpVec = vectorizeTree(OpVL); + DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); + OpVecs.push_back(OpVec); + } + + Module *M = F->getParent(); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; + Function *CF = Intrinsic::getDeclaration(M, ID, Tys); + Value *V = Builder.CreateCall(CF, OpVecs); + E->VectorizedValue = V; + return V; + } + case Instruction::ShuffleVector: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); + } + setInsertPointAfterBundle(E->Scalars); + + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + // Create a vector of LHS op1 RHS + BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); + Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); + + // Create a vector of LHS op2 RHS + Instruction *VL1 = cast<Instruction>(E->Scalars[1]); + BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); + Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); + + // Create appropriate shuffle to take alternative operations from + // the vector. 
+ std::vector<Constant *> Mask(E->Scalars.size()); + unsigned e = E->Scalars.size(); + for (unsigned i = 0; i < e; ++i) { + if (i & 1) + Mask[i] = Builder.getInt32(e + i); + else + Mask[i] = Builder.getInt32(i); + } + + Value *ShuffleMask = ConstantVector::get(Mask); + + Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + E->VectorizedValue = V; + if (Instruction *I = dyn_cast<Instruction>(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } default: llvm_unreachable("unknown inst"); } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree() { @@ -1571,8 +1956,8 @@ Value *BoUpSLP::vectorizeTree() { // Skip users that we already RAUW. This happens when one instruction // has multiple uses of the same value. - if (std::find(Scalar->use_begin(), Scalar->use_end(), User) == - Scalar->use_end()) + if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == + Scalar->user_end()) continue; assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); @@ -1586,12 +1971,7 @@ Value *BoUpSLP::vectorizeTree() { Value *Lane = Builder.getInt32(it->Lane); // Generate extracts for out-of-tree users. // Find the insertion point for the extractelement lane. - if (PHINode *PN = dyn_cast<PHINode>(Vec)) { - Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt()); - Value *Ex = Builder.CreateExtractElement(Vec, Lane); - CSEBlocks.insert(PN->getParent()); - User->replaceUsesOfWith(Scalar, Ex); - } else if (isa<Instruction>(Vec)){ + if (isa<Instruction>(Vec)){ if (PHINode *PH = dyn_cast<PHINode>(User)) { for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { if (PH->getIncomingValue(i) == Scalar) { @@ -1624,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() { // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. if (Entry->NeedToGather) continue; @@ -1633,15 +2012,17 @@ Value *BoUpSLP::vectorizeTree() { Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { - for (Value::use_iterator User = Scalar->use_begin(), - UE = Scalar->use_end(); User != UE; ++User) { - DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); - - assert((ScalarToTreeEntry.count(*User) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(*User))) && +#ifndef NDEBUG + for (User *U : Scalar->users()) { + DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + + assert((ScalarToTreeEntry.count(U) || + // It is legal to replace users in the ignorelist by undef. 
+ (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != + UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } +#endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } @@ -1650,24 +2031,14 @@ Value *BoUpSLP::vectorizeTree() { } } - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BlocksNumbers[it].forget(); - } + for (auto &BN : BlocksNumbers) + BN.second.forget(); + Builder.ClearInsertionPoint(); return VectorizableTree[0].VectorizedValue; } -class DTCmp { - const DominatorTree *DT; - -public: - DTCmp(const DominatorTree *DT) : DT(DT) {} - bool operator()(const BasicBlock *A, const BasicBlock *B) const { - return DT->properlyDominates(A, B); - } -}; - void BoUpSLP::optimizeGatherSequence() { DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); @@ -1703,21 +2074,30 @@ void BoUpSLP::optimizeGatherSequence() { Insert->moveBefore(PreHeader->getTerminator()); } + // Make a list of all reachable blocks in our CSE queue. + SmallVector<const DomTreeNode *, 8> CSEWorkList; + CSEWorkList.reserve(CSEBlocks.size()); + for (BasicBlock *BB : CSEBlocks) + if (DomTreeNode *N = DT->getNode(BB)) { + assert(DT->isReachableFromEntry(N)); + CSEWorkList.push_back(N); + } + // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. - SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end()); - std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT)); + std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), + [this](const DomTreeNode *A, const DomTreeNode *B) { + return DT->properlyDominates(A, B); + }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; - for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(), - E = CSEWorkList.end(); - I != E; ++I) { - assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) && + for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { + assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); - BasicBlock *BB = *I; + BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = it++; @@ -1733,7 +2113,7 @@ void BoUpSLP::optimizeGatherSequence() { DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); In->eraseFromParent(); - In = 0; + In = nullptr; break; } } @@ -1760,19 +2140,25 @@ struct SLPVectorizer : public FunctionPass { } ScalarEvolution *SE; - DataLayout *DL; + const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + SE = &getAnalysis<ScalarEvolution>(); - DL = getAnalysisIfAvailable<DataLayout>(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? 
&DLP->getDataLayout() : nullptr; TTI = &getAnalysis<TargetTransformInfo>(); + TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); AA = &getAnalysis<AliasAnalysis>(); LI = &getAnalysis<LoopInfo>(); - DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); StoreRefs.clear(); bool Changed = false; @@ -1793,15 +2179,14 @@ struct SLPVectorizer : public FunctionPass { DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); - // Use the bollom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT); + // Use the bottom-up SLP vectorizer to construct chains that start with + // store instructions. + BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT); // Scan the blocks in the function in post order. for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), e = po_end(&F.getEntryBlock()); it != e; ++it) { BasicBlock *BB = *it; - // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; @@ -1821,15 +2206,15 @@ struct SLPVectorizer : public FunctionPass { return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired<ScalarEvolution>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<TargetTransformInfo>(); AU.addRequired<LoopInfo>(); - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfo>(); - AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } @@ -1845,8 +2230,11 @@ private: bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \param BuildVector A list of users to ignore for the purpose of + /// scheduling and that don't need extracting. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); + bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + ArrayRef<Value *> BuildVector = None); /// \brief Try to vectorize a chain that may start at the operands of \p V. bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); @@ -1867,7 +2255,7 @@ private: StoreListMap StoreRefs; }; -/// \brief Check that the Values in the slice in VL array are still existant in +/// \brief Check that the Values in the slice in VL array are still existent in /// the WeakVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakVH array. @@ -1894,7 +2282,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, if (!isPowerOf2_32(Sz) || VF < 2) return false; - // Keep track of values that were delete by vectorizing in the loop below. + // Keep track of values that were deleted by vectorizing in the loop below. SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); bool Changed = false; @@ -2000,7 +2388,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { // Check that the pointer points to scalars. Type *Ty = SI->getValueOperand()->getType(); if (Ty->isAggregateType() || Ty->isVectorTy()) - return 0; + continue; // Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); @@ -2019,7 +2407,8 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { return tryToVectorizeList(VL, R); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + ArrayRef<Value *> BuildVector) { if (VL.size() < 2) return false; @@ -2047,7 +2436,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { bool Changed = false; - // Keep track of values that were delete by vectorizing in the loop below. + // Keep track of values that were deleted by vectorizing in the loop below. SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); for (unsigned i = 0, e = VL.size(); i < e; ++i) { @@ -2069,13 +2458,38 @@ << "\n"); ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); - R.buildTree(Ops); + ArrayRef<Value *> BuildVectorSlice; + if (!BuildVector.empty()) + BuildVectorSlice = BuildVector.slice(i, OpsWidth); + + R.buildTree(Ops, BuildVectorSlice); int Cost = R.getTreeCost(); if (Cost < -SLPCostThreshold) { - DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n"); - R.vectorizeTree(); - + DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + Value *VectorizedRoot = R.vectorizeTree(); + + // Reconstruct the build vector by extracting the vectorized root. This + // way we handle the case where some elements of the vector are undefined. + // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2)) + if (!BuildVectorSlice.empty()) { + // The insert point is the last build vector instruction. The vectorized + // root will precede it. This guarantees that we get an instruction. The + // vectorized tree could have been constant folded. + Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); + unsigned VecIdx = 0; + for (auto &V : BuildVectorSlice) { + IRBuilder<true, NoFolder> Builder( + ++BasicBlock::iterator(InsertAfter)); + InsertElementInst *IE = cast<InsertElementInst>(V); + Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( + VectorizedRoot, Builder.getInt32(VecIdx++))); + IE->setOperand(1, Extract); + IE->removeFromParent(); + IE->insertAfter(Extract); + InsertAfter = IE; + } + } // Move to the next bundle. i += VF - 1; Changed = true; @@ -2184,7 +2598,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, /// *p = /// class HorizontalReduction { - SmallPtrSet<Value *, 16> ReductionOps; + SmallVector<Value *, 16> ReductionOps; SmallVector<Value *, 32> ReducedVals; BinaryOperator *ReductionRoot; @@ -2202,12 +2616,12 @@ class HorizontalReduction { public: HorizontalReduction() - : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0), + : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, - DataLayout *DL) { + const DataLayout *DL) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "This phi needs to use the binary operator"); @@ -2217,10 +2631,10 @@ public: // In such a case start looking for a tree rooted in the first '+'.
if (Phi) { if (B->getOperand(0) == Phi) { - Phi = 0; + Phi = nullptr; B = dyn_cast<BinaryOperator>(B->getOperand(1)); } else if (B->getOperand(1) == Phi) { - Phi = 0; + Phi = nullptr; B = dyn_cast<BinaryOperator>(B->getOperand(0)); } } @@ -2278,7 +2692,7 @@ public: // We need to be able to reassociate the adds. if (!TreeN->isAssociative()) return false; - ReductionOps.insert(TreeN); + ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); @@ -2306,7 +2720,7 @@ public: if (NumReducedVals < ReduxWidth) return false; - Value *VectorizedTree = 0; + Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); @@ -2315,7 +2729,7 @@ public: for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth); - V.buildTree(ValsToReduce, &ReductionOps); + V.buildTree(ValsToReduce, ReductionOps); // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); @@ -2349,13 +2763,13 @@ public: } // Update users. if (ReductionPHI) { - assert(ReductionRoot != NULL && "Need a reduction operation"); + assert(ReductionRoot && "Need a reduction operation"); ReductionRoot->setOperand(0, VectorizedTree); ReductionRoot->setOperand(1, ReductionPHI); } else ReductionRoot->replaceAllUsesWith(VectorizedTree); } - return VectorizedTree != 0; + return VectorizedTree != nullptr; } private: @@ -2434,18 +2848,21 @@ private: /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *IE, - SmallVectorImpl<Value *> &Ops) { - if (!isa<UndefValue>(IE->getOperand(0))) +static bool findBuildVector(InsertElementInst *FirstInsertElem, + SmallVectorImpl<Value *> &BuildVector, + SmallVectorImpl<Value *> &BuildVectorOpds) { + if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) return false; + InsertElementInst *IE = FirstInsertElem; while (true) { - Ops.push_back(IE->getOperand(1)); + BuildVector.push_back(IE); + BuildVectorOpds.push_back(IE->getOperand(1)); if (IE->use_empty()) return false; - InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back()); + InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); if (!NextUse) return true; @@ -2512,7 +2929,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { break; } - // Start over at the next instruction of a differnt type (or the end). + // Start over at the next instruction of a different type (or the end). IncIt = SameTypeIt; } } @@ -2535,7 +2952,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Value *Rdx = (P->getIncomingBlock(0) == BB ? (P->getIncomingValue(0)) - : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0)); + : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) + : nullptr)); // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); if (!BI) @@ -2574,7 +2992,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) && + if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && HorRdx.tryToReduce(R, TTI)) || tryToVectorize(BinOp, R))) { Changed = true; @@ -2610,12 +3028,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Try to vectorize trees that start at insertelement instructions. 
- if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) { - SmallVector<Value *, 8> Ops; - if (!findBuildVector(IE, Ops)) + if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { + SmallVector<Value *, 16> BuildVector; + SmallVector<Value *, 16> BuildVectorOpds; + if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; - if (tryToVectorizeList(Ops, R)) { + // Vectorize starting with the build vector operands ignoring the + // BuildVector instructions for the purpose of scheduling and user + // extraction. + if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); |
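To recap the alternate-opcode machinery added above (getAltOpcode, isAltInst, and the SK_Alternate shuffle emitted in vectorizeTree), here is a self-contained C++ sketch of the same even/odd opcode check and shuffle-mask construction. It uses a plain enum in place of LLVM opcodes; all names are illustrative only:

#include <cstddef>
#include <cstdio>
#include <vector>

enum Opcode { Add, Sub, FAdd, FSub, Other };

// Mirrors getAltOpcode(): the opcode expected to alternate with Op.
static Opcode altOpcode(Opcode Op) {
  switch (Op) {
  case FAdd: return FSub;
  case FSub: return FAdd;
  case Add:  return Sub;
  case Sub:  return Add;
  default:   return Other;
  }
}

// Mirrors isAltInst(): true if the opcodes alternate, e.g. fadd,fsub,fadd,fsub.
static bool isAlternatingBundle(const std::vector<Opcode> &VL) {
  Opcode Op = VL[0], Alt = altOpcode(Op);
  for (std::size_t i = 1; i < VL.size(); ++i)
    if (VL[i] != ((i & 1) ? Alt : Op))
      return false;
  return true;
}

int main() {
  std::vector<Opcode> Bundle = {FAdd, FSub, FAdd, FSub};
  std::printf("alternating: %s\n", isAlternatingBundle(Bundle) ? "yes" : "no");
  // The merge mask built in vectorizeTree() takes even lanes from the first
  // vector op and odd lanes from the second: Mask[i] = (i & 1) ? e + i : i.
  std::size_t e = Bundle.size();
  for (std::size_t i = 0; i < e; ++i)
    std::printf("Mask[%zu] = %zu\n", i, (i & 1) ? e + i : i);
  return 0;
}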
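The cost-model refinement in getEntryCost(), which classifies the second operands of a binary-op bundle as OK_UniformConstantValue, OK_NonUniformConstantValue, or OK_AnyValue, can be modeled the same way. A sketch under the assumption that each lane's operand is either a known integer constant or unknown (nullopt); the helper name is hypothetical:

#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

enum OperandKind { AnyValue, UniformConstant, NonUniformConstant };

// All lanes the same constant -> UniformConstant; all constants but not all
// equal -> NonUniformConstant; any non-constant lane -> AnyValue.
static OperandKind classifyOperands(const std::vector<std::optional<int>> &Op2) {
  OperandKind Kind = UniformConstant;
  std::optional<int> First;
  for (std::size_t i = 0; i < Op2.size(); ++i) {
    if (!Op2[i])
      return AnyValue;  // a non-constant operand ends the analysis
    if (i == 0) {
      First = Op2[i];
      continue;
    }
    if (Kind == UniformConstant && *Op2[i] != *First)
      Kind = NonUniformConstant;
  }
  return Kind;
}

int main() {
  std::printf("%d\n", classifyOperands({3, 3, 3, 3}));          // uniform
  std::printf("%d\n", classifyOperands({1, 2, 3, 4}));          // non-uniform
  std::printf("%d\n", classifyOperands({1, std::nullopt, 3}));  // any value
  return 0;
}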