diff options
Diffstat (limited to 'lib/Transforms/Vectorize/BBVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/BBVectorize.cpp | 1012 |
1 files changed, 794 insertions, 218 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 62d23cb..f7be3e3 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -28,12 +28,14 @@ #include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" @@ -41,17 +43,27 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> using namespace llvm; +static cl::opt<bool> +IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), + cl::Hidden, cl::desc("Ignore target information")); + static cl::opt<unsigned> ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization")); +static cl::opt<bool> +UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), + cl::Hidden, cl::desc("Use the chain depth requirement with" + " target information")); + static cl::opt<unsigned> SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs")); @@ -93,8 +105,9 @@ static cl::opt<bool> NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values")); +// FIXME: This should default to false once pointer vector support works. static cl::opt<bool> -NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, +NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, cl::desc("Don't try to vectorize pointer values")); static cl::opt<bool> @@ -159,6 +172,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the" " cycle-checking process")); + +static cl::opt<bool> +PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, dump the basic block after" + " every pair is fused")); #endif STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); @@ -177,13 +196,19 @@ namespace { BBVectorize(Pass *P, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { AA = &P->getAnalysis<AliasAnalysis>(); + DT = &P->getAnalysis<DominatorTree>(); SE = &P->getAnalysis<ScalarEvolution>(); - TD = P->getAnalysisIfAvailable<TargetData>(); + TD = P->getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : + P->getAnalysisIfAvailable<TargetTransformInfo>(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; } typedef std::pair<Value *, Value *> ValuePair; + typedef std::pair<ValuePair, int> ValuePairWithCost; typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair + typedef std::pair<VPPair, unsigned> VPPairWithType; typedef std::pair<std::multimap<Value *, Value *>::iterator, std::multimap<Value *, Value *>::iterator> VPIteratorPair; typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, @@ -191,8 +216,11 @@ namespace { VPPIteratorPair; AliasAnalysis *AA; + DominatorTree *DT; ScalarEvolution *SE; - TargetData *TD; + DataLayout *TD; + TargetTransformInfo *TTI; + const VectorTargetTransformInfo *VTTI; // FIXME: const correct? @@ -201,11 +229,23 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); + // FIXME: The current implementation does not account for pairs that + // are connected in multiple ways. For example: + // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) + enum PairConnectionType { + PairConnectionDirect, + PairConnectionSwap, + PairConnectionSplat + }; + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs); + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes); void buildDepMap(BasicBlock &BB, std::multimap<Value *, Value *> &CandidatePairs, @@ -213,19 +253,29 @@ namespace { DenseSet<ValuePair> &PairableInstUsers); void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs); void fuseChosenPairs(BasicBlock &BB, std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs); + DenseMap<Value *, Value *>& ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len); + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings, int &FixedOrder); bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -236,6 +286,7 @@ namespace { std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, ValuePair P); bool pairsConflict(ValuePair P, ValuePair Q, @@ -267,17 +318,21 @@ namespace { void findBestTreeFor( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, std::multimap<ValuePair, ValuePair> &PairableInstUserMap, DenseMap<Value *, Value *> &ChosenPairs, DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs); + Instruction *J, unsigned o); void fillNewShuffleMask(LLVMContext& Context, Instruction *J, unsigned MaskOffset, unsigned NumInElem, @@ -289,20 +344,20 @@ namespace { bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, Value *&LOp, unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeR, + Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, unsigned IdxOff = 0); Value *getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs); + Instruction *J, unsigned o, bool IBeforeJ); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool FlipMemInputs); + bool IBeforeJ); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2, bool FlipMemInputs); + Instruction *&K2); void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, @@ -314,10 +369,6 @@ namespace { DenseMap<Value *, Value *> &ChosenPairs, std::multimap<Value *, Value *> &LoadMoveSet); - void collectPtrInfo(std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<Value *> &LowPtrInsts); - bool canMoveUsesOfIAfterJ(BasicBlock &BB, std::multimap<Value *, Value *> &LoadMoveSet, Instruction *I, Instruction *J); @@ -330,13 +381,22 @@ namespace { void combineMetadata(Instruction *K, const Instruction *J); bool vectorizeBB(BasicBlock &BB) { + if (!DT->isReachableFromEntry(&BB)) { + DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << + " in " << BB.getParent()->getName() << "\n"); + return false; + } + + DEBUG(if (VTTI) dbgs() << "BBV: using target information\n"); + bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. unsigned n = 1; for (unsigned v = 2; - v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); + (VTTI || v <= Config.VectorBits) && + (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << @@ -363,8 +423,12 @@ namespace { virtual bool runOnBasicBlock(BasicBlock &BB) { AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); SE = &getAnalysis<ScalarEvolution>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : + getAnalysisIfAvailable<TargetTransformInfo>(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; return vectorizeBB(BB); } @@ -372,8 +436,10 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { BasicBlockPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); AU.addRequired<ScalarEvolution>(); AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<DominatorTree>(); AU.addPreserved<ScalarEvolution>(); AU.setPreservesCFG(); } @@ -415,6 +481,14 @@ namespace { T2 = cast<CastInst>(I)->getSrcTy(); else T2 = T1; + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + T2 = SI->getCondition()->getType(); + } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { + T2 = SI->getOperand(0)->getType(); + } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { + T2 = CI->getOperand(0)->getType(); + } } // Returns the weight associated with the provided value. A chain of @@ -446,6 +520,62 @@ namespace { return 1; } + // Returns the cost of the provided instruction using VTTI. + // This does not handle loads and stores. + unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { + switch (Opcode) { + default: break; + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector GEPs. + return 0; + case Instruction::Br: + return VTTI->getCFInstrCost(Opcode); + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return VTTI->getArithmeticInstrCost(Opcode, T1); + case Instruction::Select: + case Instruction::ICmp: + case Instruction::FCmp: + return VTTI->getCmpSelInstrCost(Opcode, T1, T2); + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::ShuffleVector: + return VTTI->getCastInstrCost(Opcode, T1, T2); + } + + return 1; + } + // This determines the relative offset of two loads or stores, returning // true if the offset could be determined to be some constant value. // For example, if OffsetInElmts == 1, then J accesses the memory directly @@ -453,20 +583,30 @@ namespace { // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, - int64_t &OffsetInElmts) { + unsigned &IAddressSpace, unsigned &JAddressSpace, + int64_t &OffsetInElmts, bool ComputeOffset = true) { OffsetInElmts = 0; - if (isa<LoadInst>(I)) { - IPtr = cast<LoadInst>(I)->getPointerOperand(); - JPtr = cast<LoadInst>(J)->getPointerOperand(); - IAlignment = cast<LoadInst>(I)->getAlignment(); - JAlignment = cast<LoadInst>(J)->getAlignment(); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + LoadInst *LJ = cast<LoadInst>(J); + IPtr = LI->getPointerOperand(); + JPtr = LJ->getPointerOperand(); + IAlignment = LI->getAlignment(); + JAlignment = LJ->getAlignment(); + IAddressSpace = LI->getPointerAddressSpace(); + JAddressSpace = LJ->getPointerAddressSpace(); } else { - IPtr = cast<StoreInst>(I)->getPointerOperand(); - JPtr = cast<StoreInst>(J)->getPointerOperand(); - IAlignment = cast<StoreInst>(I)->getAlignment(); - JAlignment = cast<StoreInst>(J)->getAlignment(); + StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); + IPtr = SI->getPointerOperand(); + JPtr = SJ->getPointerOperand(); + IAlignment = SI->getAlignment(); + JAlignment = SJ->getAlignment(); + IAddressSpace = SI->getPointerAddressSpace(); + JAddressSpace = SJ->getPointerAddressSpace(); } + if (!ComputeOffset) + return true; + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); const SCEV *JPtrSCEV = SE->getSCEV(JPtr); @@ -536,6 +676,19 @@ namespace { return false; } + + bool isPureIEChain(InsertElementInst *IE) { + InsertElementInst *IENext = IE; + do { + if (!isa<UndefValue>(IENext->getOperand(0)) && + !isa<InsertElementInst>(IENext->getOperand(0))) { + return false; + } + } while ((IENext = + dyn_cast<InsertElementInst>(IENext->getOperand(0)))); + + return true; + } }; // This function implements one vectorization iteration on the provided @@ -546,11 +699,18 @@ namespace { std::vector<Value *> AllPairableInsts; DenseMap<Value *, Value *> AllChosenPairs; + DenseSet<ValuePair> AllFixedOrderPairs; + DenseMap<VPPair, unsigned> AllPairConnectionTypes; + std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; do { std::vector<Value *> PairableInsts; std::multimap<Value *, Value *> CandidatePairs; + DenseSet<ValuePair> FixedOrderPairs; + DenseMap<ValuePair, int> CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, + FixedOrderPairs, + CandidatePairCostSavings, PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; @@ -563,10 +723,18 @@ namespace { // Note that it only matters that both members of the second pair use some // element of the first pair (to allow for splatting). - std::multimap<ValuePair, ValuePair> ConnectedPairs; - computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); + std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; + DenseMap<VPPair, unsigned> PairConnectionTypes; + computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, + PairConnectionTypes); if (ConnectedPairs.empty()) continue; + for (std::multimap<ValuePair, ValuePair>::iterator + I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); + I != IE; ++I) { + ConnectedPairDeps.insert(VPPair(I->second, I->first)); + } + // Build the pairable-instruction dependency map DenseSet<ValuePair> PairableInstUsers; buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); @@ -578,13 +746,48 @@ namespace { // variables. DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + choosePairs(CandidatePairs, CandidatePairCostSavings, + PairableInsts, FixedOrderPairs, PairConnectionTypes, + ConnectedPairs, ConnectedPairDeps, PairableInstUsers, ChosenPairs); if (ChosenPairs.empty()) continue; AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), PairableInsts.end()); AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); + + // Only for the chosen pairs, propagate information on fixed-order pairs, + // pair connections, and their types to the data structures used by the + // pair fusion procedures. + for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), + IE = ChosenPairs.end(); I != IE; ++I) { + if (FixedOrderPairs.count(*I)) + AllFixedOrderPairs.insert(*I); + else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) + AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); + + for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); + J != IE; ++J) { + DenseMap<VPPair, unsigned>::iterator K = + PairConnectionTypes.find(VPPair(*I, *J)); + if (K != PairConnectionTypes.end()) { + AllPairConnectionTypes.insert(*K); + } else { + K = PairConnectionTypes.find(VPPair(*J, *I)); + if (K != PairConnectionTypes.end()) + AllPairConnectionTypes.insert(*K); + } + } + } + + for (std::multimap<ValuePair, ValuePair>::iterator + I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); + I != IE; ++I) { + if (AllPairConnectionTypes.count(*I)) { + AllConnectedPairs.insert(*I); + AllConnectedPairDeps.insert(VPPair(I->second, I->first)); + } + } } while (ShouldContinue); if (AllChosenPairs.empty()) return false; @@ -597,11 +800,13 @@ namespace { // replaced with a vector_extract on the result. Subsequent optimization // passes should coalesce the build/extract combinations. - fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, + AllPairConnectionTypes, + AllConnectedPairs, AllConnectedPairDeps); // It is important to cleanup here so that future iterations of this // function have less work to do. - (void) SimplifyInstructionsInBlock(&BB, TD); + (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo()); return true; } @@ -667,15 +872,22 @@ namespace { !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (T1->getScalarSizeInBits() == 1) { if (!Config.VectorizeBools) return false; } else { - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) return false; } - + + if (T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; @@ -691,8 +903,8 @@ namespace { T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits) + if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits)) return false; return true; @@ -703,10 +915,14 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len) { + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings, int &FixedOrder) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); + CostSavings = 0; + FixedOrder = 0; + // Loads and stores can be merged if they have different alignments, // but are otherwise the same. if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | @@ -719,38 +935,84 @@ namespace { unsigned MaxTypeBits = std::max( IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (MaxTypeBits > Config.VectorBits) + if (!VTTI && MaxTypeBits > Config.VectorBits) return false; // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts) && abs64(OffsetInElmts) == 1) { - if (Config.AlignedOnly) { - Type *aTypeI = isa<StoreInst>(I) ? - cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa<StoreInst>(J) ? - cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + FixedOrder = (int) OffsetInElmts; + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + + Type *aTypeI = isa<StoreInst>(I) ? + cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? + cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); + if (Config.AlignedOnly) { // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. - unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; } + + if (VTTI) { + unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(), + IAlignment, IAddressSpace); + unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(), + JAlignment, JAddressSpace); + unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType, + BottomAlignment, + IAddressSpace); + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts = VTTI->getNumberOfParts(VType); + if (VParts > 1) + return false; + else if (!VParts && VCost == ICost + JCost) + return false; + + CostSavings = ICost + JCost - VCost; + } } else { return false; } + } else if (VTTI) { + unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); + unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); + + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts1 = VTTI->getNumberOfParts(VT1), + VParts2 = VTTI->getNumberOfParts(VT2); + if (VParts1 > 1 || VParts2 > 1) + return false; + else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) + return false; + + CostSavings = ICost + JCost - VCost; } // The powi intrinsic is special because only the first argument is @@ -833,6 +1095,8 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -869,7 +1133,9 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; + int CostSavings, FixedOrder; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, + CostSavings, FixedOrder)) continue; // J is a candidate for merging with I. if (!PairableInsts.size() || @@ -878,6 +1144,14 @@ namespace { } CandidatePairs.insert(ValuePair(I, J)); + if (VTTI) + CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), + CostSavings)); + + if (FixedOrder == 1) + FixedOrderPairs.insert(ValuePair(I, J)); + else if (FixedOrder == -1) + FixedOrderPairs.insert(ValuePair(J, I)); // The next call to this function must start after the last instruction // selected during this invocation. @@ -887,7 +1161,8 @@ namespace { } DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << "\n"); + << *I << " <-> " << *J << " (cost savings: " << + CostSavings << ")\n"); // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected @@ -915,6 +1190,7 @@ namespace { std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, ValuePair P) { StoreInst *SI, *SJ; @@ -946,12 +1222,18 @@ namespace { VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); // Look for <I, J>: - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); + } // Look for <J, I>: - if (isSecondInIteratorPair<Value*>(*I, JPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + if (isSecondInIteratorPair<Value*>(*I, JPairRange)) { + VPPair VP(P, ValuePair(*J, *I)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); + } } if (Config.SplatBreaksChain) continue; @@ -962,8 +1244,11 @@ namespace { P.first == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); + } } } @@ -985,8 +1270,11 @@ namespace { P.second == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); + } } } } @@ -997,7 +1285,8 @@ namespace { void BBVectorize::computeConnectedPairs( std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs) { + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PE = PairableInsts.end(); PI != PE; ++PI) { @@ -1006,7 +1295,7 @@ namespace { for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; P != choiceRange.second; ++P) computePairsConnectedTo(CandidatePairs, PairableInsts, - ConnectedPairs, *P); + ConnectedPairs, PairConnectionTypes, *P); } DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() @@ -1196,7 +1485,7 @@ namespace { PrunedTree.insert(QTop.first); // Visit each child, pruning as necessary... - DenseMap<ValuePair, size_t> BestChildren; + SmallVector<ValuePairWithDepth, 8> BestChildren; VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; K != QTopRange.second; ++K) { @@ -1228,7 +1517,7 @@ namespace { DenseSet<ValuePair> CurrentPairs; bool CanAdd = true; - for (DenseMap<ValuePair, size_t>::iterator C2 + for (SmallVector<ValuePairWithDepth, 8>::iterator C2 = BestChildren.begin(), E2 = BestChildren.end(); C2 != E2; ++C2) { if (C2->first.first == C->first.first || @@ -1313,22 +1602,22 @@ namespace { // to an already-selected child. Check for this here, and if a // conflict is found, then remove the previously-selected child // before adding this one in its place. - for (DenseMap<ValuePair, size_t>::iterator C2 + for (SmallVector<ValuePairWithDepth, 8>::iterator C2 = BestChildren.begin(); C2 != BestChildren.end();) { if (C2->first.first == C->first.first || C2->first.first == C->first.second || C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers)) - BestChildren.erase(C2++); + C2 = BestChildren.erase(C2); else ++C2; } - BestChildren.insert(ValuePairWithDepth(C->first, C->second)); + BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); } - for (DenseMap<ValuePair, size_t>::iterator C + for (SmallVector<ValuePairWithDepth, 8>::iterator C = BestChildren.begin(), E2 = BestChildren.end(); C != E2; ++C) { size_t DepthF = getDepthFactor(C->first.first); @@ -1341,13 +1630,17 @@ namespace { // pairs, given the choice of root pairs as an iterator range. void BBVectorize::findBestTreeFor( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, std::multimap<ValuePair, ValuePair> &PairableInstUserMap, DenseMap<Value *, Value *> &ChosenPairs, DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck) { for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; J != ChoiceRange.second; ++J) { @@ -1397,17 +1690,289 @@ namespace { PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, PrunedTree, *J, UseCycleCheck); - size_t EffSize = 0; - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) - EffSize += getDepthFactor(S->first); + int EffSize = 0; + if (VTTI) { + DenseSet<Value *> PrunedTreeInstrs; + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + PrunedTreeInstrs.insert(S->first); + PrunedTreeInstrs.insert(S->second); + } + + // The set of pairs that have already contributed to the total cost. + DenseSet<ValuePair> IncomingPairs; + + // If the cost model were perfect, this might not be necessary; but we + // need to make sure that we don't get stuck vectorizing our own + // shuffle chains. + bool HasNontrivialInsts = false; + + // The node weights represent the cost savings associated with + // fusing the pair of instructions. + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + if (!isa<ShuffleVectorInst>(S->first) && + !isa<InsertElementInst>(S->first) && + !isa<ExtractElementInst>(S->first)) + HasNontrivialInsts = true; + + bool FlipOrder = false; + + if (getDepthFactor(S->first)) { + int ESContrib = CandidatePairCostSavings.find(*S)->second; + DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" + << *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize += ESContrib; + } + + // The edge weights contribute in a negative sense: they represent + // the cost of shuffles. + VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); + if (IP.first != ConnectedPairDeps.end()) { + unsigned NumDepsDirect = 0, NumDepsSwap = 0; + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + ++NumDepsDirect; + else if (R->second == PairConnectionSwap) + ++NumDepsSwap; + } + + // If there are more swaps than direct connections, then + // the pair order will be flipped during fusion. So the real + // number of swaps is the minimum number. + FlipOrder = !FixedOrderPairs.count(*S) && + ((NumDepsSwap > NumDepsDirect) || + FixedOrderPairs.count(ValuePair(S->second, S->first))); + + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + Type *Ty1 = Q->second.first->getType(), + *Ty2 = Q->second.second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + if ((R->second == PairConnectionDirect && FlipOrder) || + (R->second == PairConnectionSwap && !FlipOrder) || + R->second == PairConnectionSplat) { + int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *Q->second.first << " <-> " << *Q->second.second << + "} -> {" << + *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + } + } + } + + // Compute the cost of outgoing edges. We assume that edges outgoing + // to shuffles, inserts or extracts can be merged, and so contribute + // no additional cost. + if (!S->first->getType()->isVoidTy()) { + Type *Ty1 = S->first->getType(), + *Ty2 = S->second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + bool NeedsExtraction = false; + for (Value::use_iterator I = S->first->use_begin(), + IE = S->first->use_end(); I != IE; ++I) { + if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { + // Shuffle can be folded if it has no other input + if (isa<UndefValue>(SI->getOperand(1))) + continue; + } + if (isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty1->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty1, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 0); + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->first << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + + NeedsExtraction = false; + for (Value::use_iterator I = S->second->use_begin(), + IE = S->second->use_end(); I != IE; ++I) { + if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { + // Shuffle can be folded if it has no other input + if (isa<UndefValue>(SI->getOperand(1))) + continue; + } + if (isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty2->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty2, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 1); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->second << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + } + + // Compute the cost of incoming edges. + if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { + Instruction *S1 = cast<Instruction>(S->first), + *S2 = cast<Instruction>(S->second); + for (unsigned o = 0; o < S1->getNumOperands(); ++o) { + Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); + + // Combining constants into vector constants (or small vector + // constants into larger ones are assumed free). + if (isa<Constant>(O1) && isa<Constant>(O2)) + continue; + + if (FlipOrder) + std::swap(O1, O2); + + ValuePair VP = ValuePair(O1, O2); + ValuePair VPR = ValuePair(O2, O1); + + // Internal edges are not handled here. + if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + continue; + + Type *Ty1 = O1->getType(), + *Ty2 = O2->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + // Combining vector operations of the same type is also assumed + // folded with other operations. + if (Ty1 == Ty2) { + // If both are insert elements, then both can be widened. + InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), + *IEO2 = dyn_cast<InsertElementInst>(O2); + if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) + continue; + // If both are extract elements, and both have the same input + // type, then they can be replaced with a shuffle + ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), + *EIO2 = dyn_cast<ExtractElementInst>(O2); + if (EIO1 && EIO2 && + EIO1->getOperand(0)->getType() == + EIO2->getOperand(0)->getType()) + continue; + // If both are a shuffle with equal operand types and only two + // unqiue operands, then they can be replaced with a single + // shuffle + ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), + *SIO2 = dyn_cast<ShuffleVectorInst>(O2); + if (SIO1 && SIO2 && + SIO1->getOperand(0)->getType() == + SIO2->getOperand(0)->getType()) { + SmallSet<Value *, 4> SIOps; + SIOps.insert(SIO1->getOperand(0)); + SIOps.insert(SIO1->getOperand(1)); + SIOps.insert(SIO2->getOperand(0)); + SIOps.insert(SIO2->getOperand(1)); + if (SIOps.size() <= 2) + continue; + } + } + + int ESContrib; + // This pair has already been formed. + if (IncomingPairs.count(VP)) { + continue; + } else if (IncomingPairs.count(VPR)) { + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 0); + ESContrib += (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 1); + } else if (!Ty1->isVectorTy()) { + // O1 needs to be inserted into a vector of size O2, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty2, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty2); + } else if (!Ty2->isVectorTy()) { + // O2 needs to be inserted into a vector of size O1, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty1, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty1); + } else { + Type *TyBig = Ty1, *TySmall = Ty2; + if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) + std::swap(TyBig, TySmall); + + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, TyBig); + if (TyBig != TySmall) + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + TyBig, TySmall); + } + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" + << *O1 << " <-> " << *O2 << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + IncomingPairs.insert(VP); + } + } + } + + if (!HasNontrivialInsts) { + DEBUG(if (DebugPairSelection) dbgs() << + "\tNo non-trivial instructions in tree;" + " override to zero effective size\n"); + EffSize = 0; + } + } else { + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) + EffSize += (int) getDepthFactor(S->first); + } DEBUG(if (DebugPairSelection) dbgs() << "BBV: found pruned Tree for pair {" << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { + if (((VTTI && !UseChainDepthWithTI) || + MaxDepth >= Config.ReqChainDepth) && + EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; BestTree = PrunedTree; @@ -1419,8 +1984,12 @@ namespace { // that will be fused into vector instructions. void BBVectorize::choosePairs( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs) { bool UseCycleCheck = @@ -1435,9 +2004,12 @@ namespace { VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); // The best pair to choose and its tree: - size_t BestMaxDepth = 0, BestEffSize = 0; + size_t BestMaxDepth = 0; + int BestEffSize = 0; DenseSet<ValuePair> BestTree; - findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + findBestTreeFor(CandidatePairs, CandidatePairCostSavings, + PairableInsts, FixedOrderPairs, PairConnectionTypes, + ConnectedPairs, ConnectedPairDeps, PairableInstUsers, PairableInstUserMap, ChosenPairs, BestTree, BestMaxDepth, BestEffSize, ChoiceRange, UseCycleCheck); @@ -1490,24 +2062,19 @@ namespace { // Returns the value that is to be used as the pointer input to the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, - Instruction *I, Instruction *J, unsigned o, - bool FlipMemInputs) { + Instruction *I, Instruction *J, unsigned o) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts; - // Note: the analysis might fail here, that is why FlipMemInputs has + // Note: the analysis might fail here, that is why the pair order has // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - OffsetInElmts); + IAddressSpace, JAddressSpace, + OffsetInElmts, false); // The pointer value is taken to be the one with the lowest offset. - Value *VPtr; - if (!FlipMemInputs) { - VPtr = IPtr; - } else { - VPtr = JPtr; - } + Value *VPtr = IPtr; Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); @@ -1515,7 +2082,7 @@ namespace { Type *VArgPtrType = PointerType::get(VArgType, cast<PointerType>(IPtr->getType())->getAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), - /* insert before */ FlipMemInputs ? J : I); + /* insert before */ I); } void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, @@ -1585,23 +2152,12 @@ namespace { Instruction *J, unsigned o, Value *&LOp, unsigned numElemL, Type *ArgTypeL, Type *ArgTypeH, - unsigned IdxOff) { + bool IBeforeJ, unsigned IdxOff) { bool ExpandedIEChain = false; if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { // If we have a pure insertelement chain, then this can be rewritten // into a chain that directly builds the larger type. - bool PureChain = true; - InsertElementInst *LIENext = LIE; - do { - if (!isa<UndefValue>(LIENext->getOperand(0)) && - !isa<InsertElementInst>(LIENext->getOperand(0))) { - PureChain = false; - break; - } - } while ((LIENext = - dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); - - if (PureChain) { + if (isPureIEChain(LIE)) { SmallVector<Value *, 8> VectElemts(numElemL, UndefValue::get(ArgTypeL->getScalarType())); InsertElementInst *LIENext = LIE; @@ -1619,8 +2175,9 @@ namespace { LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], ConstantInt::get(Type::getInt32Ty(Context), i + IdxOff), - getReplacementName(I, true, o, i+1)); - LIENext->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, i+1)); + LIENext->insertBefore(IBeforeJ ? J : I); LIEPrev = LIENext; } @@ -1635,7 +2192,7 @@ namespace { // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs) { + Instruction *J, unsigned o, bool IBeforeJ) { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); @@ -1646,12 +2203,6 @@ namespace { Instruction *L = I, *H = J; Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - if (FlipMemInputs) { - L = J; - H = I; - ArgTypeL = ArgTypeJ; - ArgTypeH = ArgTypeI; - } unsigned numElemL; if (ArgTypeL->isVectorTy()) @@ -1804,8 +2355,9 @@ namespace { Instruction *S = new ShuffleVectorInst(I1, UndefValue::get(I1T), ConstantVector::get(Mask), - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } @@ -1826,8 +2378,9 @@ namespace { Instruction *NewI1 = new ShuffleVectorInst(I1, UndefValue::get(I1T), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); - NewI1->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + NewI1->insertBefore(IBeforeJ ? J : I); I1 = NewI1; I1T = I2T; I1Elem = I2Elem; @@ -1842,8 +2395,9 @@ namespace { Instruction *NewI2 = new ShuffleVectorInst(I2, UndefValue::get(I2T), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); - NewI2->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + NewI2->insertBefore(IBeforeJ ? J : I); I2 = NewI2; I2T = I1T; I2Elem = I1Elem; @@ -1863,8 +2417,8 @@ namespace { Instruction *NewOp = new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), - getReplacementName(I, true, o)); - NewOp->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, true, o)); + NewOp->insertBefore(IBeforeJ ? J : I); return NewOp; } } @@ -1872,17 +2426,17 @@ namespace { Type *ArgType = ArgTypeL; if (numElemL < numElemH) { if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, - ArgTypeL, VArgType, 1)) { + ArgTypeL, VArgType, IBeforeJ, 1)) { // This is another short-circuit case: we're combining a scalar into // a vector that is formed by an IE chain. We've just expanded the IE // chain, now insert the scalar and we're done. Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, - ArgTypeH)) { + ArgTypeH, IBeforeJ)) { // The two vector inputs to the shuffle must be the same length, // so extend the smaller vector to be the same length as the larger one. Instruction *NLOp; @@ -1897,29 +2451,32 @@ namespace { NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } else { NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } - NLOp->insertBefore(J); + NLOp->insertBefore(IBeforeJ ? J : I); LOp = NLOp; } ArgType = ArgTypeH; } else if (numElemL > numElemH) { if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, - ArgTypeH, VArgType)) { + ArgTypeH, VArgType, IBeforeJ)) { Instruction *S = InsertElementInst::Create(LOp, HOp, ConstantInt::get(Type::getInt32Ty(Context), numElemL), - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, - ArgTypeL)) { + ArgTypeL, IBeforeJ)) { Instruction *NHOp; if (numElemH > 1) { std::vector<Constant *> Mask(numElemL); @@ -1931,13 +2488,15 @@ namespace { NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } else { NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } - NHOp->insertBefore(J); + NHOp->insertBefore(IBeforeJ ? J : I); HOp = NHOp; } } @@ -1955,19 +2514,21 @@ namespace { } Instruction *BV = new ShuffleVectorInst(LOp, HOp, - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); + ConstantVector::get(Mask), + getReplacementName(IBeforeJ ? I : J, true, o)); + BV->insertBefore(IBeforeJ ? J : I); return BV; } Instruction *BV1 = InsertElementInst::Create( UndefValue::get(VArgType), LOp, CV0, - getReplacementName(I, true, o, 1)); - BV1->insertBefore(I); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + BV1->insertBefore(IBeforeJ ? J : I); Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, - getReplacementName(I, true, o, 2)); - BV2->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 2)); + BV2->insertBefore(IBeforeJ ? J : I); return BV2; } @@ -1976,7 +2537,7 @@ namespace { void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool FlipMemInputs) { + bool IBeforeJ) { unsigned NumOperands = I->getNumOperands(); for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { @@ -1985,8 +2546,7 @@ namespace { if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { // This is the pointer for a load/store instruction. - ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, - FlipMemInputs); + ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); continue; } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); @@ -2014,8 +2574,7 @@ namespace { continue; } - ReplacedOperands[o] = - getReplacementInput(Context, I, J, o, FlipMemInputs); + ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); } } @@ -2026,8 +2585,7 @@ namespace { void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, - Instruction *&K1, Instruction *&K2, - bool FlipMemInputs) { + Instruction *&K1, Instruction *&K2) { if (isa<StoreInst>(I)) { AA->replaceWithNewValue(I, K); AA->replaceWithNewValue(J, K); @@ -2057,13 +2615,11 @@ namespace { } K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask2 : Mask1), + ConstantVector::get( Mask1), getReplacementName(K, false, 1)); } else { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, + K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); } @@ -2075,13 +2631,11 @@ namespace { } K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask1 : Mask2), + ConstantVector::get( Mask2), getReplacementName(K, false, 2)); } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, + K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); } @@ -2181,36 +2735,6 @@ namespace { } } - // As with the aliasing information, SCEV can also change because of - // vectorization. This information is used to compute relative pointer - // offsets; the necessary information will be cached here prior to - // fusion. - void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<Value *> &LowPtrInsts) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PIE = PairableInsts.end(); PI != PIE; ++PI) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); - if (P == ChosenPairs.end()) continue; - - Instruction *I = cast<Instruction>(P->first); - Instruction *J = cast<Instruction>(P->second); - - if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) - continue; - - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; - int64_t OffsetInElmts; - if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - OffsetInElmts) || abs64(OffsetInElmts) != 1) - llvm_unreachable("Pre-fusion pointer analysis failed"); - - Value *LowPI = (OffsetInElmts > 0) ? I : J; - LowPtrInsts.insert(LowPI); - } - } - // When the first instruction in each pair is cloned, it will inherit its // parent's metadata. This metadata must be combined with that of the other // instruction in a safe way. @@ -2244,27 +2768,27 @@ namespace { // second member). void BBVectorize::fuseChosenPairs(BasicBlock &BB, std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs) { + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) { LLVMContext& Context = BB.getContext(); // During the vectorization process, the order of the pairs to be fused // could be flipped. So we'll add each pair, flipped, into the ChosenPairs // list. After a pair is fused, the flipped pair is removed from the list. - std::vector<ValuePair> FlippedPairs; - FlippedPairs.reserve(ChosenPairs.size()); + DenseSet<ValuePair> FlippedPairs; for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), E = ChosenPairs.end(); P != E; ++P) - FlippedPairs.push_back(ValuePair(P->second, P->first)); - for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), + FlippedPairs.insert(ValuePair(P->second, P->first)); + for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), E = FlippedPairs.end(); P != E; ++P) ChosenPairs.insert(*P); std::multimap<Value *, Value *> LoadMoveSet; collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); - DenseSet<Value *> LowPtrInsts; - collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts); - DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { @@ -2304,44 +2828,92 @@ namespace { continue; } - bool FlipMemInputs = false; - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end()); + // If the pair must have the other order, then flip it. + bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); + if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { + // This pair does not have a fixed order, and so we might want to + // flip it if that will yield fewer shuffles. We count the number + // of dependencies connected via swaps, and those directly connected, + // and flip the order if the number of swaps is greater. + bool OrigOrder = true; + VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J)); + if (IP.first == ConnectedPairDeps.end()) { + IP = ConnectedPairDeps.equal_range(ValuePair(J, I)); + OrigOrder = false; + } + if (IP.first != ConnectedPairDeps.end()) { + unsigned NumDepsDirect = 0, NumDepsSwap = 0; + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + ++NumDepsDirect; + else if (R->second == PairConnectionSwap) + ++NumDepsSwap; + } + + if (!OrigOrder) + std::swap(NumDepsDirect, NumDepsSwap); + + if (NumDepsSwap > NumDepsDirect) { + FlipPairOrder = true; + DEBUG(dbgs() << "BBV: reordering pair: " << *I << + " <-> " << *J << "\n"); + } + } + } + + Instruction *L = I, *H = J; + if (FlipPairOrder) + std::swap(H, L); + + // If the pair being fused uses the opposite order from that in the pair + // connection map, then we need to flip the types. + VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L)); + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + R->second = PairConnectionSwap; + else if (R->second == PairConnectionSwap) + R->second = PairConnectionDirect; + } + + bool LBeforeH = !FlipPairOrder; unsigned NumOperands = I->getNumOperands(); SmallVector<Value *, 3> ReplacedOperands(NumOperands); - getReplacementInputsForPair(Context, I, J, ReplacedOperands, - FlipMemInputs); + getReplacementInputsForPair(Context, L, H, ReplacedOperands, + LBeforeH); // Make a copy of the original operation, change its type to the vector // type and replace its operands with the vector operands. - Instruction *K = I->clone(); - if (I->hasName()) K->takeName(I); + Instruction *K = L->clone(); + if (L->hasName()) + K->takeName(L); + else if (H->hasName()) + K->takeName(H); if (!isa<StoreInst>(K)) - K->mutateType(getVecTypeForPair(I->getType(), J->getType())); + K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - combineMetadata(K, J); + combineMetadata(K, H); + K->intersectOptionalDataWith(H); for (unsigned o = 0; o < NumOperands; ++o) K->setOperand(o, ReplacedOperands[o]); - // If we've flipped the memory inputs, make sure that we take the correct - // alignment. - if (FlipMemInputs) { - if (isa<StoreInst>(K)) - cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); - else - cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); - } - K->insertAfter(J); // Instruction insertion point: Instruction *InsertionPt = K; Instruction *K1 = 0, *K2 = 0; - replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, - FlipMemInputs); + replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); // The use tree of the first original instruction must be moved to after // the location of the second instruction. The entire use tree of the @@ -2351,10 +2923,10 @@ namespace { moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); if (!isa<StoreInst>(I)) { - I->replaceAllUsesWith(K1); - J->replaceAllUsesWith(K2); - AA->replaceWithNewValue(I, K1); - AA->replaceWithNewValue(J, K2); + L->replaceAllUsesWith(K1); + H->replaceAllUsesWith(K2); + AA->replaceWithNewValue(L, K1); + AA->replaceWithNewValue(H, K2); } // Instructions that may read from memory may be in the load move set. @@ -2387,6 +2959,9 @@ namespace { SE->forgetValue(J); I->eraseFromParent(); J->eraseFromParent(); + + DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << + BB << "\n"); } DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); @@ -2397,6 +2972,7 @@ char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) |