Diffstat (limited to 'contrib/llvm/lib/Transforms/Vectorize')
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp   | 3251
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5823
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4224
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp     |   48
4 files changed, 13346 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
new file mode 100644
index 0000000..8844d57
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -0,0 +1,3251 @@
+//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizer by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
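+// As a rough illustration (a simplified sketch, not output from an actual
+// run), two independent, isomorphic scalar operations such as
+//
+//   %x0 = fadd float %a0, %b0
+//   %x1 = fadd float %a1, %b1
+//
+// may be fused into a single
+//
+//   %x = fadd <2 x float> %a, %b
+//
+// provided a sufficiently deep chain of such pairable operations exists.
+//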
+//===----------------------------------------------------------------------===//
+
+#define BBV_NAME "bb-vectorize"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE BBV_NAME
+
+static cl::opt<bool>
+IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
+ cl::Hidden, cl::desc("Ignore target information"));
+
+static cl::opt<unsigned>
+ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
+ cl::desc("The required chain depth for vectorization"));
+
+static cl::opt<bool>
+UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
+ cl::Hidden, cl::desc("Use the chain depth requirement with"
+ " target information"));
+
+static cl::opt<unsigned>
+SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
+ cl::desc("The maximum search distance for instruction pairs"));
+
+static cl::opt<bool>
+SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
+ cl::desc("Replicating one element to a pair breaks the chain"));
+
+static cl::opt<unsigned>
+VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
+ cl::desc("The size of the native vector registers"));
+
+static cl::opt<unsigned>
+MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
+ cl::desc("The maximum number of pairing iterations"));
+
+static cl::opt<bool>
+Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to form non-2^n-length vectors"));
+
+static cl::opt<unsigned>
+MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
+ cl::desc("The maximum number of pairable instructions per group"));
+
+static cl::opt<unsigned>
+MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
+ cl::desc("The maximum number of candidate instruction pairs per group"));
+
+static cl::opt<unsigned>
+MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
+ cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
+ " a full cycle check"));
+
+static cl::opt<bool>
+NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize boolean (i1) values"));
+
+static cl::opt<bool>
+NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize integer values"));
+
+static cl::opt<bool>
+NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize floating-point values"));
+
+// FIXME: This should default to false once pointer vector support works.
+static cl::opt<bool>
+NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
+ cl::desc("Don't try to vectorize pointer values"));
+
+static cl::opt<bool>
+NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize casting (conversion) operations"));
+
+static cl::opt<bool>
+NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize floating-point math intrinsics"));
+
+static cl::opt<bool>
+ NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize BitManipulation intrinsics"));
+
+static cl::opt<bool>
+NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
+
+static cl::opt<bool>
+NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize select instructions"));
+
+static cl::opt<bool>
+NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize comparison instructions"));
+
+static cl::opt<bool>
+NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize getelementptr instructions"));
+
+static cl::opt<bool>
+NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize loads and stores"));
+
+static cl::opt<bool>
+AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
+ cl::desc("Only generate aligned loads and stores"));
+
+static cl::opt<bool>
+NoMemOpBoost("bb-vectorize-no-mem-op-boost",
+ cl::init(false), cl::Hidden,
+ cl::desc("Don't boost the chain-depth contribution of loads and stores"));
+
+static cl::opt<bool>
+FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
+ cl::desc("Use a fast instruction dependency analysis"));
+
+#ifndef NDEBUG
+static cl::opt<bool>
+DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " instruction-examination process"));
+static cl::opt<bool>
+DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " candidate-selection process"));
+static cl::opt<bool>
+DebugPairSelection("bb-vectorize-debug-pair-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " pair-selection process"));
+static cl::opt<bool>
+DebugCycleCheck("bb-vectorize-debug-cycle-check",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " cycle-checking process"));
+
+static cl::opt<bool>
+PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, dump the basic block after"
+ " every pair is fused"));
+#endif
+
+STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
+
+namespace {
+ struct BBVectorize : public BasicBlockPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ const VectorizeConfig Config;
+
+ BBVectorize(const VectorizeConfig &C = VectorizeConfig())
+ : BasicBlockPass(ID), Config(C) {
+ initializeBBVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
+ : BasicBlockPass(ID), Config(C) {
+ AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TTI = IgnoreTargetInfo
+ ? nullptr
+ : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ }
+
+ typedef std::pair<Value *, Value *> ValuePair;
+ typedef std::pair<ValuePair, int> ValuePairWithCost;
+ typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
+ typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+ typedef std::pair<VPPair, unsigned> VPPairWithType;
+
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+
+ // FIXME: const correct?
+
+ bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
+
+ bool getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts, bool NonPow2Len);
+
+ // FIXME: The current implementation does not account for pairs that
+ // are connected in multiple ways. For example:
+ // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
+ enum PairConnectionType {
+ PairConnectionDirect,
+ PairConnectionSwap,
+ PairConnectionSplat
+ };
+
+ void computeConnectedPairs(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes);
+
+ void buildDepMap(BasicBlock &BB,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers);
+
+ void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ void fuseChosenPairs(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *>& ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
+
+
+ bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
+
+ bool areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore, bool NonPow2Len,
+ int &CostSavings, int &FixedOrder);
+
+ bool trackUsesOfI(DenseSet<Value *> &Users,
+ AliasSetTracker &WriteSet, Instruction *I,
+ Instruction *J, bool UpdateUsers = true,
+ DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
+
+ void computePairsConnectedTo(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ ValuePair P);
+
+ bool pairsConflict(ValuePair P, ValuePair Q,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> >
+ *PairableInstUserMap = nullptr,
+ DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
+
+ bool pairWillFormCycle(ValuePair P,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
+ DenseSet<ValuePair> &CurrentPairs);
+
+ void pruneDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG,
+ DenseSet<ValuePair> &PrunedDAG, ValuePair J,
+ bool UseCycleCheck);
+
+ void buildInitialDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG, ValuePair J);
+
+ void findBestDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
+ int &BestEffSize, Value *II, std::vector<Value *>&JJ,
+ bool UseCycleCheck);
+
+ Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o);
+
+ void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+ unsigned MaskOffset, unsigned NumInElem,
+ unsigned NumInElem1, unsigned IdxOffset,
+ std::vector<Constant*> &Mask);
+
+ Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
+ Instruction *J);
+
+ bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
+ unsigned o, Value *&LOp, unsigned numElemL,
+ Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
+ unsigned IdxOff = 0);
+
+ Value *getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool IBeforeJ);
+
+ void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
+ bool IBeforeJ);
+
+ void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt, Instruction *&K1,
+ Instruction *&K2);
+
+ void collectPairLoadMoveSet(BasicBlock &BB,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
+ Instruction *I);
+
+ void collectLoadMoveSet(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs);
+
+ bool canMoveUsesOfIAfterJ(BasicBlock &BB,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
+ Instruction *I, Instruction *J);
+
+ void moveUsesOfIAfterJ(BasicBlock &BB,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
+ Instruction *&InsertionPt,
+ Instruction *I, Instruction *J);
+
+ bool vectorizeBB(BasicBlock &BB) {
+ if (skipOptnoneFunction(BB))
+ return false;
+ if (!DT->isReachableFromEntry(&BB)) {
+ DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
+ " in " << BB.getParent()->getName() << "\n");
+ return false;
+ }
+
+ DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
+
+ bool changed = false;
+      // Iterate a sufficient number of times to merge types of size 1 bit,
+      // then 2 bits, then 4, etc. up to half of the width of the target
+      // vector register.
+ unsigned n = 1;
+ for (unsigned v = 2;
+ (TTI || v <= Config.VectorBits) &&
+ (!Config.MaxIter || n <= Config.MaxIter);
+ v *= 2, ++n) {
+ DEBUG(dbgs() << "BBV: fusing loop #" << n <<
+ " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (vectorizePairs(BB))
+ changed = true;
+ else
+ break;
+ }
+
+ if (changed && !Pow2LenOnly) {
+ ++n;
+ for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
+ DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
+ n << " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (!vectorizePairs(BB, true)) break;
+ }
+ }
+
+ DEBUG(dbgs() << "BBV: done!\n");
+ return changed;
+ }
+
+ bool runOnBasicBlock(BasicBlock &BB) override {
+ // OptimizeNone check deferred to vectorizeBB().
+
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TTI = IgnoreTargetInfo
+ ? nullptr
+ : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *BB.getParent());
+
+ return vectorizeBB(BB);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ BasicBlockPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
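+    // Returns the vector type obtained by concatenating the elements of the
+    // two given types, which must share a scalar type. For example, per the
+    // element counting below: (float, float) -> <2 x float>,
+    // (<2 x i32>, i32) -> <3 x i32>, and (<2 x i32>, <2 x i32>) -> <4 x i32>.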
+ static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
+ assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
+ "Cannot form vector from incompatible scalar types");
+ Type *STy = ElemTy->getScalarType();
+
+ unsigned numElem;
+ if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
+ numElem = VTy->getNumElements();
+ } else {
+ numElem = 1;
+ }
+
+ if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
+ numElem += VTy->getNumElements();
+ } else {
+ numElem += 1;
+ }
+
+ return VectorType::get(STy, numElem);
+ }
+
+ static inline void getInstructionTypes(Instruction *I,
+ Type *&T1, Type *&T2) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // For stores, it is the value type, not the pointer type that matters
+ // because the value is what will come from a vector register.
+
+ Value *IVal = SI->getValueOperand();
+ T1 = IVal->getType();
+ } else {
+ T1 = I->getType();
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I))
+ T2 = CI->getSrcTy();
+ else
+ T2 = T1;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ T2 = SI->getCondition()->getType();
+ } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ T2 = SI->getOperand(0)->getType();
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+ T2 = CI->getOperand(0)->getType();
+ }
+ }
+
+ // Returns the weight associated with the provided value. A chain of
+ // candidate pairs has a length given by the sum of the weights of its
+ // members (one weight per pair; the weight of each member of the pair
+ // is assumed to be the same). This length is then compared to the
+ // chain-length threshold to determine if a given chain is significant
+ // enough to be vectorized. The length is also used in comparing
+ // candidate chains where longer chains are considered to be better.
+ // Note: when this function returns 0, the resulting instructions are
+ // not actually fused.
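+    // For example, with the default required chain depth of 6 and the
+    // memory-operation boost enabled, a load or store pair has weight
+    // 6/2 = 3, so a chain of load -> add -> store pairs has length
+    // 3 + 1 + 3 = 7 and meets the threshold, while insertelement and
+    // extractelement pairs contribute nothing to the length.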
+ inline size_t getDepthFactor(Value *V) {
+ // InsertElement and ExtractElement have a depth factor of zero. This is
+ // for two reasons: First, they cannot be usefully fused. Second, because
+ // the pass generates a lot of these, they can confuse the simple metric
+ // used to compare the dags in the next iteration. Thus, giving them a
+ // weight of zero allows the pass to essentially ignore them in
+ // subsequent iterations when looking for vectorization opportunities
+ // while still tracking dependency chains that flow through those
+ // instructions.
+ if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
+ return 0;
+
+ // Give a load or store half of the required depth so that load/store
+ // pairs will vectorize.
+ if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
+ return Config.ReqChainDepth/2;
+
+ return 1;
+ }
+
+ // Returns the cost of the provided instruction using TTI.
+ // This does not handle loads and stores.
+ unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue) {
+ switch (Opcode) {
+ default: break;
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because scalar GEPs are usually
+ // lowered to the instruction addressing mode. At the moment we don't
+ // generate vector GEPs.
+ return 0;
+ case Instruction::Br:
+ return TTI->getCFInstrCost(Opcode);
+ case Instruction::PHI:
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ case Instruction::ShuffleVector:
+ return TTI->getCastInstrCost(Opcode, T1, T2);
+ }
+
+ return 1;
+ }
+
+ // This determines the relative offset of two loads or stores, returning
+ // true if the offset could be determined to be some constant value.
+ // For example, if OffsetInElmts == 1, then J accesses the memory directly
+ // after I; if OffsetInElmts == -1 then I accesses the memory
+ // directly after J.
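+    // Concretely (assuming the usual data layout for illustration): if I
+    // loads an i32 from %p and J loads an i32 from a pointer whose SCEV
+    // differs from that of %p by 4 bytes, then the i32 store size is 4,
+    // OffsetInElmts is set to 4/4 = 1, and the function returns true.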
+ bool getPairPtrInfo(Instruction *I, Instruction *J,
+ Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
+ unsigned &IAddressSpace, unsigned &JAddressSpace,
+ int64_t &OffsetInElmts, bool ComputeOffset = true) {
+ OffsetInElmts = 0;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LoadInst *LJ = cast<LoadInst>(J);
+ IPtr = LI->getPointerOperand();
+ JPtr = LJ->getPointerOperand();
+ IAlignment = LI->getAlignment();
+ JAlignment = LJ->getAlignment();
+ IAddressSpace = LI->getPointerAddressSpace();
+ JAddressSpace = LJ->getPointerAddressSpace();
+ } else {
+ StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
+ IPtr = SI->getPointerOperand();
+ JPtr = SJ->getPointerOperand();
+ IAlignment = SI->getAlignment();
+ JAlignment = SJ->getAlignment();
+ IAddressSpace = SI->getPointerAddressSpace();
+ JAddressSpace = SJ->getPointerAddressSpace();
+ }
+
+ if (!ComputeOffset)
+ return true;
+
+ const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
+ const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
+
+ // If this is a trivial offset, then we'll get something like
+ // 1*sizeof(type). With target data, which we need anyway, this will get
+ // constant folded into a number.
+ const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
+ if (const SCEVConstant *ConstOffSCEV =
+ dyn_cast<SCEVConstant>(OffsetSCEV)) {
+ ConstantInt *IntOff = ConstOffSCEV->getValue();
+ int64_t Offset = IntOff->getSExtValue();
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ Type *VTy = IPtr->getType()->getPointerElementType();
+ int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
+
+ Type *VTy2 = JPtr->getType()->getPointerElementType();
+ if (VTy != VTy2 && Offset < 0) {
+ int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
+ OffsetInElmts = Offset/VTy2TSS;
+ return (std::abs(Offset) % VTy2TSS) == 0;
+ }
+
+ OffsetInElmts = Offset/VTyTSS;
+ return (std::abs(Offset) % VTyTSS) == 0;
+ }
+
+ return false;
+ }
+
+ // Returns true if the provided CallInst represents an intrinsic that can
+ // be vectorized.
+ bool isVectorizableIntrinsic(CallInst* I) {
+ Function *F = I->getCalledFunction();
+ if (!F) return false;
+
+ Intrinsic::ID IID = F->getIntrinsicID();
+ if (!IID) return false;
+
+ switch(IID) {
+ default:
+ return false;
+ case Intrinsic::sqrt:
+ case Intrinsic::powi:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::pow:
+ case Intrinsic::round:
+ case Intrinsic::copysign:
+ case Intrinsic::ceil:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::trunc:
+ case Intrinsic::floor:
+ case Intrinsic::fabs:
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum:
+ return Config.VectorizeMath;
+ case Intrinsic::bswap:
+ case Intrinsic::ctpop:
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ return Config.VectorizeBitManipulations;
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ return Config.VectorizeFMA;
+ }
+ }
+
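+    // Returns true if the provided insertelement instruction terminates a
+    // chain of insertelements that builds a vector entirely from scratch,
+    // i.e. the chain of first operands eventually reaches undef. An
+    // illustrative sketch:
+    //
+    //   %v0 = insertelement <2 x float> undef, float %a, i32 0
+    //   %v1 = insertelement <2 x float> %v0, float %b, i32 1
+    //
+    // is a pure IE chain rooted at %v1.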
+ bool isPureIEChain(InsertElementInst *IE) {
+ InsertElementInst *IENext = IE;
+ do {
+ if (!isa<UndefValue>(IENext->getOperand(0)) &&
+ !isa<InsertElementInst>(IENext->getOperand(0))) {
+ return false;
+ }
+ } while ((IENext =
+ dyn_cast<InsertElementInst>(IENext->getOperand(0))));
+
+ return true;
+ }
+ };
+
+ // This function implements one vectorization iteration on the provided
+ // basic block. It returns true if the block is changed.
+ bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
+ bool ShouldContinue;
+ BasicBlock::iterator Start = BB.getFirstInsertionPt();
+
+ std::vector<Value *> AllPairableInsts;
+ DenseMap<Value *, Value *> AllChosenPairs;
+ DenseSet<ValuePair> AllFixedOrderPairs;
+ DenseMap<VPPair, unsigned> AllPairConnectionTypes;
+ DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
+ AllConnectedPairDeps;
+
+ do {
+ std::vector<Value *> PairableInsts;
+ DenseMap<Value *, std::vector<Value *> > CandidatePairs;
+ DenseSet<ValuePair> FixedOrderPairs;
+ DenseMap<ValuePair, int> CandidatePairCostSavings;
+ ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
+ FixedOrderPairs,
+ CandidatePairCostSavings,
+ PairableInsts, NonPow2Len);
+ if (PairableInsts.empty()) continue;
+
+ // Build the candidate pair set for faster lookups.
+ DenseSet<ValuePair> CandidatePairsSet;
+ for (DenseMap<Value *, std::vector<Value *> >::iterator I =
+ CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
+ for (std::vector<Value *>::iterator J = I->second.begin(),
+ JE = I->second.end(); J != JE; ++J)
+ CandidatePairsSet.insert(ValuePair(I->first, *J));
+
+ // Now we have a map of all of the pairable instructions and we need to
+ // select the best possible pairing. A good pairing is one such that the
+ // users of the pair are also paired. This defines a (directed) forest
+ // over the pairs such that two pairs are connected iff the second pair
+ // uses the first.
+
+ // Note that it only matters that both members of the second pair use some
+ // element of the first pair (to allow for splatting).
+
+ DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
+ ConnectedPairDeps;
+ DenseMap<VPPair, unsigned> PairConnectionTypes;
+ computeConnectedPairs(CandidatePairs, CandidatePairsSet,
+ PairableInsts, ConnectedPairs, PairConnectionTypes);
+ if (ConnectedPairs.empty()) continue;
+
+ for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
+ I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+ I != IE; ++I)
+ for (std::vector<ValuePair>::iterator J = I->second.begin(),
+ JE = I->second.end(); J != JE; ++J)
+ ConnectedPairDeps[*J].push_back(I->first);
+
+ // Build the pairable-instruction dependency map
+ DenseSet<ValuePair> PairableInstUsers;
+ buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+ // There is now a graph of the connected pairs. For each variable, pick
+ // the pairing with the largest dag meeting the depth requirement on at
+ // least one branch. Then select all pairings that are part of that dag
+ // and remove them from the list of available pairings and pairable
+ // variables.
+
+ DenseMap<Value *, Value *> ChosenPairs;
+ choosePairs(CandidatePairs, CandidatePairsSet,
+ CandidatePairCostSavings,
+ PairableInsts, FixedOrderPairs, PairConnectionTypes,
+ ConnectedPairs, ConnectedPairDeps,
+ PairableInstUsers, ChosenPairs);
+
+ if (ChosenPairs.empty()) continue;
+ AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
+ PairableInsts.end());
+ AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+
+ // Only for the chosen pairs, propagate information on fixed-order pairs,
+ // pair connections, and their types to the data structures used by the
+ // pair fusion procedures.
+ for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
+ IE = ChosenPairs.end(); I != IE; ++I) {
+ if (FixedOrderPairs.count(*I))
+ AllFixedOrderPairs.insert(*I);
+ else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
+ AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
+
+ for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
+ J != IE; ++J) {
+ DenseMap<VPPair, unsigned>::iterator K =
+ PairConnectionTypes.find(VPPair(*I, *J));
+ if (K != PairConnectionTypes.end()) {
+ AllPairConnectionTypes.insert(*K);
+ } else {
+ K = PairConnectionTypes.find(VPPair(*J, *I));
+ if (K != PairConnectionTypes.end())
+ AllPairConnectionTypes.insert(*K);
+ }
+ }
+ }
+
+ for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
+ I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+ I != IE; ++I)
+ for (std::vector<ValuePair>::iterator J = I->second.begin(),
+ JE = I->second.end(); J != JE; ++J)
+ if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
+ AllConnectedPairs[I->first].push_back(*J);
+ AllConnectedPairDeps[*J].push_back(I->first);
+ }
+ } while (ShouldContinue);
+
+ if (AllChosenPairs.empty()) return false;
+ NumFusedOps += AllChosenPairs.size();
+
+ // A set of pairs has now been selected. It is now necessary to replace the
+ // paired instructions with vector instructions. For this procedure each
+ // operand must be replaced with a vector operand. This vector is formed
+ // by using build_vector on the old operands. The replaced values are then
+ // replaced with a vector_extract on the result. Subsequent optimization
+ // passes should coalesce the build/extract combinations.
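+    // For the pair of scalar fadds used as the example in the file header,
+    // the fused form looks roughly like this (an illustrative sketch, not
+    // actual pass output):
+    //
+    //   %pa = insertelement <2 x float> undef, float %a0, i32 0
+    //   %pa1 = insertelement <2 x float> %pa, float %a1, i32 1
+    //   %pb = insertelement <2 x float> undef, float %b0, i32 0
+    //   %pb1 = insertelement <2 x float> %pb, float %b1, i32 1
+    //   %vx = fadd <2 x float> %pa1, %pb1
+    //   %x0.ext = extractelement <2 x float> %vx, i32 0
+    //   %x1.ext = extractelement <2 x float> %vx, i32 1
+    //
+    // with the uses of the original %x0 and %x1 rewritten to the extracts.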
+
+ fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
+ AllPairConnectionTypes,
+ AllConnectedPairs, AllConnectedPairDeps);
+
+ // It is important to cleanup here so that future iterations of this
+ // function have less work to do.
+ (void)SimplifyInstructionsInBlock(&BB, TLI);
+ return true;
+ }
+
+ // This function returns true if the provided instruction is capable of being
+ // fused into a vector instruction. This determination is based only on the
+ // type and other attributes of the instruction.
+ bool BBVectorize::isInstVectorizable(Instruction *I,
+ bool &IsSimpleLoadStore) {
+ IsSimpleLoadStore = false;
+
+ if (CallInst *C = dyn_cast<CallInst>(I)) {
+ if (!isVectorizableIntrinsic(C))
+ return false;
+ } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      // Vectorize simple loads if possible:
+ IsSimpleLoadStore = L->isSimple();
+ if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
+ return false;
+ } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+      // Vectorize simple stores if possible:
+ IsSimpleLoadStore = S->isSimple();
+ if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
+ return false;
+ } else if (CastInst *C = dyn_cast<CastInst>(I)) {
+ // We can vectorize casts, but not casts of pointer types, etc.
+ if (!Config.VectorizeCasts)
+ return false;
+
+ Type *SrcTy = C->getSrcTy();
+ if (!SrcTy->isSingleValueType())
+ return false;
+
+ Type *DestTy = C->getDestTy();
+ if (!DestTy->isSingleValueType())
+ return false;
+ } else if (isa<SelectInst>(I)) {
+ if (!Config.VectorizeSelect)
+ return false;
+ } else if (isa<CmpInst>(I)) {
+ if (!Config.VectorizeCmp)
+ return false;
+ } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
+ if (!Config.VectorizeGEP)
+ return false;
+
+ // Currently, vector GEPs exist only with one index.
+ if (G->getNumIndices() != 1)
+ return false;
+ } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
+ isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
+ return false;
+ }
+
+ Type *T1, *T2;
+ getInstructionTypes(I, T1, T2);
+
+ // Not every type can be vectorized...
+ if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
+ !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
+ return false;
+
+ if (T1->getScalarSizeInBits() == 1) {
+ if (!Config.VectorizeBools)
+ return false;
+ } else {
+ if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
+ return false;
+ }
+
+ if (T2->getScalarSizeInBits() == 1) {
+ if (!Config.VectorizeBools)
+ return false;
+ } else {
+ if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
+ return false;
+ }
+
+ if (!Config.VectorizeFloats
+ && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
+ return false;
+
+ // Don't vectorize target-specific types.
+ if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
+ return false;
+ if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
+ return false;
+
+ if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
+ T2->getScalarType()->isPointerTy()))
+ return false;
+
+ if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+ T2->getPrimitiveSizeInBits() >= Config.VectorBits))
+ return false;
+
+ return true;
+ }
+
+ // This function returns true if the two provided instructions are compatible
+ // (meaning that they can be fused into a vector instruction). This assumes
+ // that I has already been determined to be vectorizable and that J is not
+ // in the use dag of I.
+ bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore, bool NonPow2Len,
+ int &CostSavings, int &FixedOrder) {
+ DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
+ " <-> " << *J << "\n");
+
+ CostSavings = 0;
+ FixedOrder = 0;
+
+ // Loads and stores can be merged if they have different alignments,
+ // but are otherwise the same.
+ if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
+ (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
+ return false;
+
+ Type *IT1, *IT2, *JT1, *JT2;
+ getInstructionTypes(I, IT1, IT2);
+ getInstructionTypes(J, JT1, JT2);
+ unsigned MaxTypeBits = std::max(
+ IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
+ IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
+ if (!TTI && MaxTypeBits > Config.VectorBits)
+ return false;
+
+ // FIXME: handle addsub-type operations!
+
+ if (IsSimpleLoadStore) {
+ Value *IPtr, *JPtr;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+ int64_t OffsetInElmts = 0;
+ if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ IAddressSpace, JAddressSpace, OffsetInElmts) &&
+ std::abs(OffsetInElmts) == 1) {
+ FixedOrder = (int) OffsetInElmts;
+ unsigned BottomAlignment = IAlignment;
+ if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+ Type *aTypeI = isa<StoreInst>(I) ?
+ cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ Type *aTypeJ = isa<StoreInst>(J) ?
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
+
+ if (Config.AlignedOnly) {
+ // An aligned load or store is possible only if the instruction
+ // with the lower offset has an alignment suitable for the
+ // vector type.
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
+ if (BottomAlignment < VecAlignment)
+ return false;
+ }
+
+ if (TTI) {
+ unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
+ IAlignment, IAddressSpace);
+ unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
+ JAlignment, JAddressSpace);
+ unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
+ BottomAlignment,
+ IAddressSpace);
+
+ ICost += TTI->getAddressComputationCost(aTypeI);
+ JCost += TTI->getAddressComputationCost(aTypeJ);
+ VCost += TTI->getAddressComputationCost(VType);
+
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts = TTI->getNumberOfParts(VType);
+ if (VParts > 1)
+ return false;
+ else if (!VParts && VCost == ICost + JCost)
+ return false;
+
+ CostSavings = ICost + JCost - VCost;
+ }
+ } else {
+ return false;
+ }
+ } else if (TTI) {
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+
+      // On some targets (for example, X86) the cost of a vector shift may
+      // vary depending on whether the second operand is a uniform or a
+      // non-uniform constant.
+ switch (I->getOpcode()) {
+ default : break;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+
+ // If both I and J are scalar shifts by constant, then the
+ // merged vector shift count would be either a constant splat value
+ // or a non-uniform vector of constants.
+ if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
+ Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
+ TargetTransformInfo::OK_NonUniformConstantValue;
+ } else {
+          // Check for a splat of a constant or for a non-uniform vector
+ // of constants.
+ Value *IOp = I->getOperand(1);
+ Value *JOp = J->getOperand(1);
+ if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
+ (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
+ if (SplatValue != nullptr &&
+ SplatValue == cast<Constant>(JOp)->getSplatValue())
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ }
+ }
+ }
+
+ // Note that this procedure is incorrect for insert and extract element
+ // instructions (because combining these often results in a shuffle),
+ // but this cost is ignored (because insert and extract element
+ // instructions are assigned a zero depth factor and are not really
+ // fused in general).
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
+
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts1 = TTI->getNumberOfParts(VT1),
+ VParts2 = TTI->getNumberOfParts(VT2);
+ if (VParts1 > 1 || VParts2 > 1)
+ return false;
+ else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
+ return false;
+
+ CostSavings = ICost + JCost - VCost;
+ }
+
+    // The powi, ctlz and cttz intrinsics are special because only the first
+    // argument is vectorized; the second argument must be the same for both
+    // members of the pair.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Function *FI;
+ if (CI && (FI = CI->getCalledFunction())) {
+ Intrinsic::ID IID = FI->getIntrinsicID();
+ if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+ IID == Intrinsic::cttz) {
+ Value *A1I = CI->getArgOperand(1),
+ *A1J = cast<CallInst>(J)->getArgOperand(1);
+ const SCEV *A1ISCEV = SE->getSCEV(A1I),
+ *A1JSCEV = SE->getSCEV(A1J);
+ return (A1ISCEV == A1JSCEV);
+ }
+
+ if (IID && TTI) {
+ SmallVector<Type*, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(CI->getArgOperand(i)->getType());
+ unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
+
+ Tys.clear();
+ CallInst *CJ = cast<CallInst>(J);
+ for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(CJ->getArgOperand(i)->getType());
+ unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
+
+ Tys.clear();
+ assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
+ "Intrinsic argument counts differ");
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+ IID == Intrinsic::cttz) && i == 1)
+ Tys.push_back(CI->getArgOperand(i)->getType());
+ else
+ Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
+ CJ->getArgOperand(i)->getType()));
+ }
+
+ Type *RetTy = getVecTypeForPair(IT1, JT1);
+ unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
+
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned RetParts = TTI->getNumberOfParts(RetTy);
+ if (RetParts > 1)
+ return false;
+ else if (!RetParts && VCost == ICost + JCost)
+ return false;
+
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ if (!Tys[i]->isVectorTy())
+ continue;
+
+ unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
+ if (NumParts > 1)
+ return false;
+ else if (!NumParts && VCost == ICost + JCost)
+ return false;
+ }
+
+ CostSavings = ICost + JCost - VCost;
+ }
+ }
+
+ return true;
+ }
+
+ // Figure out whether or not J uses I and update the users and write-set
+ // structures associated with I. Specifically, Users represents the set of
+ // instructions that depend on I. WriteSet represents the set
+ // of memory locations that are dependent on I. If UpdateUsers is true,
+ // and J uses I, then Users is updated to contain J and WriteSet is updated
+ // to contain any memory locations to which J writes. The function returns
+ // true if J uses I. By default, alias analysis is used to determine
+ // whether J reads from memory that overlaps with a location in WriteSet.
+ // If LoadMoveSet is not null, then it is a previously-computed map
+ // where the key is the memory-based user instruction and the value is
+ // the instruction to be compared with I. So, if LoadMoveSet is provided,
+ // then the alias analysis is not used. This is necessary because this
+ // function is called during the process of moving instructions during
+ // vectorization and the results of the alias analysis are not stable during
+ // that process.
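+  // For example, if I is a store to %p and J is a load from a location that
+  // may alias %p, then J is reported as using I even though it does not use
+  // I's value directly.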
+ bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+ AliasSetTracker &WriteSet, Instruction *I,
+ Instruction *J, bool UpdateUsers,
+ DenseSet<ValuePair> *LoadMoveSetPairs) {
+ bool UsesI = false;
+
+ // This instruction may already be marked as a user due, for example, to
+ // being a member of a selected pair.
+ if (Users.count(J))
+ UsesI = true;
+
+ if (!UsesI)
+ for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
+ JU != JE; ++JU) {
+ Value *V = *JU;
+ if (I == V || Users.count(V)) {
+ UsesI = true;
+ break;
+ }
+ }
+ if (!UsesI && J->mayReadFromMemory()) {
+ if (LoadMoveSetPairs) {
+ UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
+ } else {
+ for (AliasSetTracker::iterator W = WriteSet.begin(),
+ WE = WriteSet.end(); W != WE; ++W) {
+ if (W->aliasesUnknownInst(J, *AA)) {
+ UsesI = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (UsesI && UpdateUsers) {
+ if (J->mayWriteToMemory()) WriteSet.add(J);
+ Users.insert(J);
+ }
+
+ return UsesI;
+ }
+
+ // This function iterates over all instruction pairs in the provided
+ // basic block and collects all candidate pairs for vectorization.
+ bool BBVectorize::getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts, bool NonPow2Len) {
+ size_t TotalPairs = 0;
+ BasicBlock::iterator E = BB.end();
+ if (Start == E) return false;
+
+ bool ShouldContinue = false, IAfterStart = false;
+ for (BasicBlock::iterator I = Start++; I != E; ++I) {
+ if (I == Start) IAfterStart = true;
+
+ bool IsSimpleLoadStore;
+ if (!isInstVectorizable(&*I, IsSimpleLoadStore))
+ continue;
+
+ // Look for an instruction with which to pair instruction *I...
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory())
+ WriteSet.add(&*I);
+
+ bool JAfterStart = IAfterStart;
+ BasicBlock::iterator J = std::next(I);
+ for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
+ if (&*J == Start)
+ JAfterStart = true;
+
+ // Determine if J uses I, if so, exit the loop.
+ bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
+ if (Config.FastDep) {
+ // Note: For this heuristic to be effective, independent operations
+ // must tend to be intermixed. This is likely to be true from some
+ // kinds of grouped loop unrolling (but not the generic LLVM pass),
+ // but otherwise may require some kind of reordering pass.
+
+ // When using fast dependency analysis,
+ // stop searching after first use:
+ if (UsesI) break;
+ } else {
+ if (UsesI) continue;
+ }
+
+ // J does not use I, and comes before the first use of I, so it can be
+ // merged with I if the instructions are compatible.
+ int CostSavings, FixedOrder;
+ if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
+ CostSavings, FixedOrder))
+ continue;
+
+ // J is a candidate for merging with I.
+ if (PairableInsts.empty() ||
+ PairableInsts[PairableInsts.size() - 1] != &*I) {
+ PairableInsts.push_back(&*I);
+ }
+
+ CandidatePairs[&*I].push_back(&*J);
+ ++TotalPairs;
+ if (TTI)
+ CandidatePairCostSavings.insert(
+ ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
+
+ if (FixedOrder == 1)
+ FixedOrderPairs.insert(ValuePair(&*I, &*J));
+ else if (FixedOrder == -1)
+ FixedOrderPairs.insert(ValuePair(&*J, &*I));
+
+ // The next call to this function must start after the last instruction
+ // selected during this invocation.
+ if (JAfterStart) {
+ Start = std::next(J);
+ IAfterStart = JAfterStart = false;
+ }
+
+ DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
+ << *I << " <-> " << *J << " (cost savings: " <<
+ CostSavings << ")\n");
+
+ // If we have already found too many pairs, break here and this function
+ // will be called again starting after the last instruction selected
+ // during this invocation.
+ if (PairableInsts.size() >= Config.MaxInsts ||
+ TotalPairs >= Config.MaxPairs) {
+ ShouldContinue = true;
+ break;
+ }
+ }
+
+ if (ShouldContinue)
+ break;
+ }
+
+ DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
+ << " instructions with candidate pairs\n");
+
+ return ShouldContinue;
+ }
+
+ // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
+ // it looks for pairs such that both members have an input which is an
+ // output of PI or PJ.
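+  // For example, given candidate pairs P = <P1, P2> and Q = <Q1, Q2>, the
+  // connection P -> Q is recorded as direct if Q1 uses P1 and Q2 uses P2,
+  // as a swap if Q1 uses P2 and Q2 uses P1, and as a splat if both Q1 and
+  // Q2 use the same member of P.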
+ void BBVectorize::computePairsConnectedTo(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ ValuePair P) {
+ StoreInst *SI, *SJ;
+
+ // For each possible pairing for this variable, look at the uses of
+ // the first value...
+ for (Value::user_iterator I = P.first->user_begin(),
+ E = P.first->user_end();
+ I != E; ++I) {
+ User *UI = *I;
+ if (isa<LoadInst>(UI)) {
+ // A pair cannot be connected to a load because the load only takes one
+ // operand (the address) and it is a scalar even after vectorization.
+ continue;
+ } else if ((SI = dyn_cast<StoreInst>(UI)) &&
+ P.first == SI->getPointerOperand()) {
+ // Similarly, a pair cannot be connected to a store through its
+ // pointer operand.
+ continue;
+ }
+
+ // For each use of the first variable, look for uses of the second
+ // variable...
+ for (User *UJ : P.second->users()) {
+ if ((SJ = dyn_cast<StoreInst>(UJ)) &&
+ P.second == SJ->getPointerOperand())
+ continue;
+
+ // Look for <I, J>:
+ if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+ VPPair VP(P, ValuePair(UI, UJ));
+ ConnectedPairs[VP.first].push_back(VP.second);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
+ }
+
+ // Look for <J, I>:
+ if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
+ VPPair VP(P, ValuePair(UJ, UI));
+ ConnectedPairs[VP.first].push_back(VP.second);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
+ }
+ }
+
+ if (Config.SplatBreaksChain) continue;
+ // Look for cases where just the first value in the pair is used by
+ // both members of another pair (splatting).
+ for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
+ User *UJ = *J;
+ if ((SJ = dyn_cast<StoreInst>(UJ)) &&
+ P.first == SJ->getPointerOperand())
+ continue;
+
+ if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+ VPPair VP(P, ValuePair(UI, UJ));
+ ConnectedPairs[VP.first].push_back(VP.second);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+ }
+ }
+ }
+
+ if (Config.SplatBreaksChain) return;
+ // Look for cases where just the second value in the pair is used by
+ // both members of another pair (splatting).
+ for (Value::user_iterator I = P.second->user_begin(),
+ E = P.second->user_end();
+ I != E; ++I) {
+ User *UI = *I;
+ if (isa<LoadInst>(UI))
+ continue;
+ else if ((SI = dyn_cast<StoreInst>(UI)) &&
+ P.second == SI->getPointerOperand())
+ continue;
+
+ for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
+ User *UJ = *J;
+ if ((SJ = dyn_cast<StoreInst>(UJ)) &&
+ P.second == SJ->getPointerOperand())
+ continue;
+
+ if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+ VPPair VP(P, ValuePair(UI, UJ));
+ ConnectedPairs[VP.first].push_back(VP.second);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+ }
+ }
+ }
+ }
+
+ // This function figures out which pairs are connected. Two pairs are
+ // connected if some output of the first pair forms an input to both members
+ // of the second pair.
+ void BBVectorize::computeConnectedPairs(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes) {
+ for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+ PE = PairableInsts.end(); PI != PE; ++PI) {
+ DenseMap<Value *, std::vector<Value *> >::iterator PP =
+ CandidatePairs.find(*PI);
+ if (PP == CandidatePairs.end())
+ continue;
+
+ for (std::vector<Value *>::iterator P = PP->second.begin(),
+ E = PP->second.end(); P != E; ++P)
+ computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
+ PairableInsts, ConnectedPairs,
+ PairConnectionTypes, ValuePair(*PI, *P));
+ }
+
+ DEBUG(size_t TotalPairs = 0;
+ for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
+ ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
+ TotalPairs += I->second.size();
+ dbgs() << "BBV: found " << TotalPairs
+ << " pair connections.\n");
+ }
+
+ // This function builds a set of use tuples such that <A, B> is in the set
+ // if B is in the use dag of A. If B is in the use dag of A, then B
+ // depends on the output of A.
+ void BBVectorize::buildDepMap(
+ BasicBlock &BB,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers) {
+ DenseSet<Value *> IsInPair;
+ for (DenseMap<Value *, std::vector<Value *> >::iterator C =
+ CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
+ IsInPair.insert(C->first);
+ IsInPair.insert(C->second.begin(), C->second.end());
+ }
+
+ // Iterate through the basic block, recording all users of each
+ // pairable instruction.
+
+ BasicBlock::iterator E = BB.end(), EL =
+ BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
+ for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+ if (IsInPair.find(&*I) == IsInPair.end())
+ continue;
+
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory())
+ WriteSet.add(&*I);
+
+ for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
+ (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
+
+ if (J == EL)
+ break;
+ }
+
+ for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
+ U != E; ++U) {
+ if (IsInPair.find(*U) == IsInPair.end()) continue;
+ PairableInstUsers.insert(ValuePair(&*I, *U));
+ }
+
+ if (I == EL)
+ break;
+ }
+ }
+
+ // Returns true if an input to pair P is an output of pair Q and also an
+ // input of pair Q is an output of pair P. If this is the case, then these
+ // two pairs cannot be simultaneously fused.
+ bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
+ DenseSet<VPPair> *PairableInstUserPairSet) {
+    // Two pairs are in conflict if they are mutual users of each other.
+ bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
+ PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
+ PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
+ PairableInstUsers.count(ValuePair(P.second, Q.second));
+ bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
+ PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
+ PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
+ PairableInstUsers.count(ValuePair(Q.second, P.second));
+ if (PairableInstUserMap) {
+ // FIXME: The expensive part of the cycle check is not so much the cycle
+ // check itself but this edge insertion procedure. This needs some
+ // profiling and probably a different data structure.
+ if (PUsesQ) {
+ if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
+ (*PairableInstUserMap)[Q].push_back(P);
+ }
+ if (QUsesP) {
+ if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
+ (*PairableInstUserMap)[P].push_back(Q);
+ }
+ }
+
+ return (QUsesP && PUsesQ);
+ }
+
+ // This function walks the use graph of current pairs to see if, starting
+ // from P, the walk returns to P.
+ bool BBVectorize::pairWillFormCycle(ValuePair P,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<ValuePair> &CurrentPairs) {
+ DEBUG(if (DebugCycleCheck)
+ dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
+ << *P.second << "\n");
+    // A lookup table of visited pairs is kept because the PairableInstUserMap
+ // contains non-direct associations.
+ DenseSet<ValuePair> Visited;
+ SmallVector<ValuePair, 32> Q;
+ // General depth-first post-order traversal:
+ Q.push_back(P);
+ do {
+ ValuePair QTop = Q.pop_back_val();
+ Visited.insert(QTop);
+
+ DEBUG(if (DebugCycleCheck)
+ dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
+ << *QTop.second << "\n");
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+ PairableInstUserMap.find(QTop);
+ if (QQ == PairableInstUserMap.end())
+ continue;
+
+ for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
+ CE = QQ->second.end(); C != CE; ++C) {
+ if (*C == P) {
+ DEBUG(dbgs()
+ << "BBV: rejected to prevent non-trivial cycle formation: "
+                 << *QTop.first << " <-> " << *C->second << "\n");
+ return true;
+ }
+
+ if (CurrentPairs.count(*C) && !Visited.count(*C))
+ Q.push_back(*C);
+ }
+ } while (!Q.empty());
+
+ return false;
+ }
+
+ // This function builds the initial dag of connected pairs with the
+ // pair J at the root.
+ void BBVectorize::buildInitialDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
+ // Each of these pairs is viewed as the root node of a DAG. The DAG
+ // is then walked (depth-first). As this happens, we keep track of
+ // the pairs that compose the DAG and the maximum depth of the DAG.
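+ // A node is recorded (and popped) only once every candidate child already
+ // has an entry in DAG; its recorded depth is then the maximum of its own
+ // accumulated depth and the recorded depths of its children.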
+ SmallVector<ValuePairWithDepth, 32> Q;
+ // General depth-first post-order traversal:
+ Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+ do {
+ ValuePairWithDepth QTop = Q.back();
+
+ // Push each child onto the queue:
+ bool MoreChildren = false;
+ size_t MaxChildDepth = QTop.second;
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+ ConnectedPairs.find(QTop.first);
+ if (QQ != ConnectedPairs.end())
+ for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
+ ke = QQ->second.end(); k != ke; ++k) {
+ // Make sure that this child pair is still a candidate:
+ if (CandidatePairsSet.count(*k)) {
+ DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
+ if (C == DAG.end()) {
+ size_t d = getDepthFactor(k->first);
+ Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
+ MoreChildren = true;
+ } else {
+ MaxChildDepth = std::max(MaxChildDepth, C->second);
+ }
+ }
+ }
+
+ if (!MoreChildren) {
+ // Record the current pair as part of the DAG:
+ DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
+ Q.pop_back();
+ }
+ } while (!Q.empty());
+ }
+
+ // Given some initial dag, prune it by removing conflicting pairs (pairs
+ // that cannot be simultaneously chosen for vectorization).
+ void BBVectorize::pruneDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG,
+ DenseSet<ValuePair> &PrunedDAG, ValuePair J,
+ bool UseCycleCheck) {
+ SmallVector<ValuePairWithDepth, 32> Q;
+ // General depth-first post-order traversal:
+ Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+ do {
+ ValuePairWithDepth QTop = Q.pop_back_val();
+ PrunedDAG.insert(QTop.first);
+
+ // Visit each child, pruning as necessary...
+ SmallVector<ValuePairWithDepth, 8> BestChildren;
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+ ConnectedPairs.find(QTop.first);
+ if (QQ == ConnectedPairs.end())
+ continue;
+
+ for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
+ KE = QQ->second.end(); K != KE; ++K) {
+ DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
+ if (C == DAG.end()) continue;
+
+ // This child is in the DAG, now we need to make sure it is the
+ // best of any conflicting children. There could be multiple
+ // conflicting children, so first, determine if we're keeping
+ // this child, then delete conflicting children as necessary.
+
+ // It is also necessary to guard against pairing-induced
+ // dependencies. Consider instructions a .. x .. y .. b
+ // such that (a,b) are to be fused and (x,y) are to be fused
+ // but a is an input to x and b is an output from y. This
+ // means that y cannot be moved after b but x must be moved
+ // after b for (a,b) to be fused. In other words, after
+ // fusing (a,b) we have y .. a/b .. x where y is an input
+ // to a/b and x uses an output of a/b: x and y can no longer
+ // be legally fused. To prevent this condition, we must
+ // make sure that a child pair added to the DAG is not
+ // both an input and output of an already-selected pair.
+
+ // Pairing-induced dependencies can also form from more complicated
+ // cycles. The pair vs. pair conflicts are easy to check, so they are
+ // handled explicitly, both for "fast rejection" and because, for
+ // child vs. child conflicts, we may prefer to keep the current
+ // pair over the already-selected child.
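+ // Each candidate child is therefore checked, in order, against the children
+ // already kept at this node (BestChildren), the pairs already committed to
+ // the pruned DAG, the pairs still on the traversal queue, and the globally
+ // chosen pairs. A conflict with any of the latter three rejects the child;
+ // a conflict with an existing best child rejects it only if that child is
+ // at least as deep. The relevant pairs encountered during these checks are
+ // accumulated in CurrentPairs and fed to the optional cycle check below.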
+ DenseSet<ValuePair> CurrentPairs;
+
+ bool CanAdd = true;
+ for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
+ = BestChildren.begin(), E2 = BestChildren.end();
+ C2 != E2; ++C2) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : nullptr,
+ UseCycleCheck ? &PairableInstUserPairSet
+ : nullptr)) {
+ if (C2->second >= C->second) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(C2->first);
+ }
+ }
+ if (!CanAdd) continue;
+
+ // Even worse, this child could conflict with another node already
+ // selected for the DAG. If that is the case, ignore this child.
+ for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
+ E2 = PrunedDAG.end(); T != E2; ++T) {
+ if (T->first == C->first.first ||
+ T->first == C->first.second ||
+ T->second == C->first.first ||
+ T->second == C->first.second ||
+ pairsConflict(*T, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : nullptr,
+ UseCycleCheck ? &PairableInstUserPairSet
+ : nullptr)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(*T);
+ }
+ if (!CanAdd) continue;
+
+ // And check the queue too...
+ for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
+ E2 = Q.end(); C2 != E2; ++C2) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : nullptr,
+ UseCycleCheck ? &PairableInstUserPairSet
+ : nullptr)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(C2->first);
+ }
+ if (!CanAdd) continue;
+
+ // Last but not least, check for a conflict with any of the
+ // already-chosen pairs.
+ for (DenseMap<Value *, Value *>::iterator C2 =
+ ChosenPairs.begin(), E2 = ChosenPairs.end();
+ C2 != E2; ++C2) {
+ if (pairsConflict(*C2, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : nullptr,
+ UseCycleCheck ? &PairableInstUserPairSet
+ : nullptr)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(*C2);
+ }
+ if (!CanAdd) continue;
+
+ // To check for non-trivial cycles formed by the addition of the
+ // current pair, we've formed a list of all relevant pairs; now we use a
+ // graph walk to check for a cycle. We start from the current pair and
+ // walk the use dag to see if we again reach the current pair. If we
+ // do, then the current pair is rejected.
+
+ // FIXME: It may be more efficient to use a topological-ordering
+ // algorithm to improve the cycle check. This should be investigated.
+ if (UseCycleCheck &&
+ pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
+ continue;
+
+ // This child can be added, but we may have chosen it in preference
+ // to an already-selected child. Check for this here, and if a
+ // conflict is found, then remove the previously-selected child
+ // before adding this one in its place.
+ for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
+ = BestChildren.begin(); C2 != BestChildren.end();) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers))
+ C2 = BestChildren.erase(C2);
+ else
+ ++C2;
+ }
+
+ BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
+ }
+
+ for (SmallVectorImpl<ValuePairWithDepth>::iterator C
+ = BestChildren.begin(), E2 = BestChildren.end();
+ C != E2; ++C) {
+ size_t DepthF = getDepthFactor(C->first.first);
+ Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
+ }
+ } while (!Q.empty());
+ }
+
+ // This function finds the best dag of mutually-compatible connected
+ // pairs, given the choice of root pairs as an iterator range.
+ void BBVectorize::findBestDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
+ int &BestEffSize, Value *II, std::vector<Value *>&JJ,
+ bool UseCycleCheck) {
+ for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
+ J != JE; ++J) {
+ ValuePair IJ(II, *J);
+ if (!CandidatePairsSet.count(IJ))
+ continue;
+
+ // Before going any further, make sure that this pair does not
+ // conflict with any already-selected pairs (see comment below
+ // near the DAG pruning for more details).
+ DenseSet<ValuePair> ChosenPairSet;
+ bool DoesConflict = false;
+ for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
+ E = ChosenPairs.end(); C != E; ++C) {
+ if (pairsConflict(*C, IJ, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : nullptr,
+ UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
+ DoesConflict = true;
+ break;
+ }
+
+ ChosenPairSet.insert(*C);
+ }
+ if (DoesConflict) continue;
+
+ if (UseCycleCheck &&
+ pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
+ continue;
+
+ DenseMap<ValuePair, size_t> DAG;
+ buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
+ PairableInsts, ConnectedPairs,
+ PairableInstUsers, ChosenPairs, DAG, IJ);
+
+ // Because pruning keeps the child with the largest depth, the maximum
+ // depth of the pruned DAG is the same as that of the unpruned DAG.
+ size_t MaxDepth = DAG.lookup(IJ);
+
+ DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
+ << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
+ MaxDepth << " and size " << DAG.size() << "\n");
+
+ // At this point the DAG has been constructed, but it may contain
+ // contradictory children (meaning that different children of
+ // some dag node may be attempting to fuse the same instruction).
+ // So now we walk the dag again and, in the case of a conflict,
+ // keep only the child with the largest depth. To break a tie,
+ // favor the first child.
+
+ DenseSet<ValuePair> PrunedDAG;
+ pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, PairableInstUserMap,
+ PairableInstUserPairSet,
+ ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
+
+ int EffSize = 0;
+ if (TTI) {
+ DenseSet<Value *> PrunedDAGInstrs;
+ for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+ E = PrunedDAG.end(); S != E; ++S) {
+ PrunedDAGInstrs.insert(S->first);
+ PrunedDAGInstrs.insert(S->second);
+ }
+
+ // The set of pairs that have already contributed to the total cost.
+ DenseSet<ValuePair> IncomingPairs;
+
+ // If the cost model were perfect, this might not be necessary; but we
+ // need to make sure that we don't get stuck vectorizing our own
+ // shuffle chains.
+ bool HasNontrivialInsts = false;
+
+ // The node weights represent the cost savings associated with
+ // fusing the pair of instructions.
+ for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+ E = PrunedDAG.end(); S != E; ++S) {
+ if (!isa<ShuffleVectorInst>(S->first) &&
+ !isa<InsertElementInst>(S->first) &&
+ !isa<ExtractElementInst>(S->first))
+ HasNontrivialInsts = true;
+
+ bool FlipOrder = false;
+
+ if (getDepthFactor(S->first)) {
+ int ESContrib = CandidatePairCostSavings.find(*S)->second;
+ DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
+ << *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize += ESContrib;
+ }
+
+ // The edge weights contribute in a negative sense: they represent
+ // the cost of shuffles.
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
+ ConnectedPairDeps.find(*S);
+ if (SS != ConnectedPairDeps.end()) {
+ unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+ for (std::vector<ValuePair>::iterator T = SS->second.begin(),
+ TE = SS->second.end(); T != TE; ++T) {
+ VPPair Q(*S, *T);
+ if (!PrunedDAG.count(Q.second))
+ continue;
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q.second, Q.first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ ++NumDepsDirect;
+ else if (R->second == PairConnectionSwap)
+ ++NumDepsSwap;
+ }
+
+ // If there are more swaps than direct connections, then
+ // the pair order will be flipped during fusion, so the number of
+ // swaps actually incurred is the smaller of the two counts.
+ FlipOrder = !FixedOrderPairs.count(*S) &&
+ ((NumDepsSwap > NumDepsDirect) ||
+ FixedOrderPairs.count(ValuePair(S->second, S->first)));
+
+ for (std::vector<ValuePair>::iterator T = SS->second.begin(),
+ TE = SS->second.end(); T != TE; ++T) {
+ VPPair Q(*S, *T);
+ if (!PrunedDAG.count(Q.second))
+ continue;
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q.second, Q.first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ Type *Ty1 = Q.second.first->getType(),
+ *Ty2 = Q.second.second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+ if ((R->second == PairConnectionDirect && FlipOrder) ||
+ (R->second == PairConnectionSwap && !FlipOrder) ||
+ R->second == PairConnectionSplat) {
+ int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+
+ if (VTy->getVectorNumElements() == 2) {
+ if (R->second == PairConnectionSplat)
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VTy));
+ else
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_Reverse, VTy));
+ }
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *Q.second.first << " <-> " << *Q.second.second <<
+ "} -> {" <<
+ *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+ }
+
+ // Compute the cost of outgoing edges. We assume that edges outgoing
+ // to shuffles, inserts or extracts can be merged, and so contribute
+ // no additional cost.
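+ // An extraction cost is charged only when some user of the original scalar
+ // output is neither a foldable shuffle or extractelement nor itself part of
+ // the pruned DAG.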
+ if (!S->first->getType()->isVoidTy()) {
+ Type *Ty1 = S->first->getType(),
+ *Ty2 = S->second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ bool NeedsExtraction = false;
+ for (User *U : S->first->users()) {
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(U))
+ continue;
+ if (PrunedDAGInstrs.count(U))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty1->isVectorTy()) {
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty1, VTy);
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
+ } else
+ ESContrib = (int) TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 0);
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->first << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+
+ NeedsExtraction = false;
+ for (User *U : S->second->users()) {
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(U))
+ continue;
+ if (PrunedDAGInstrs.count(U))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty2->isVectorTy()) {
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty2, VTy);
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_ExtractSubvector, VTy,
+ Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
+ } else
+ ESContrib = (int) TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 1);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->second << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+
+ // Compute the cost of incoming edges.
+ if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
+ Instruction *S1 = cast<Instruction>(S->first),
+ *S2 = cast<Instruction>(S->second);
+ for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
+ Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
+
+ // Combining constants into vector constants (or small vector
+ // constants into larger ones) is assumed to be free.
+ if (isa<Constant>(O1) && isa<Constant>(O2))
+ continue;
+
+ if (FlipOrder)
+ std::swap(O1, O2);
+
+ ValuePair VP = ValuePair(O1, O2);
+ ValuePair VPR = ValuePair(O2, O1);
+
+ // Internal edges are not handled here.
+ if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
+ continue;
+
+ Type *Ty1 = O1->getType(),
+ *Ty2 = O2->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ // Combining vector operations of the same type is also assumed to be
+ // foldable into other operations, and so incurs no cost here.
+ if (Ty1 == Ty2) {
+ // If both are insert elements, then both can be widened.
+ InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
+ *IEO2 = dyn_cast<InsertElementInst>(O2);
+ if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
+ continue;
+ // If both are extract elements, and both have the same input
+ // type, then they can be replaced with a shuffle
+ ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
+ *EIO2 = dyn_cast<ExtractElementInst>(O2);
+ if (EIO1 && EIO2 &&
+ EIO1->getOperand(0)->getType() ==
+ EIO2->getOperand(0)->getType())
+ continue;
+ // If both are a shuffle with equal operand types and only two
+ // unique operands, then they can be replaced with a single
+ // shuffle.
+ ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
+ *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
+ if (SIO1 && SIO2 &&
+ SIO1->getOperand(0)->getType() ==
+ SIO2->getOperand(0)->getType()) {
+ SmallSet<Value *, 4> SIOps;
+ SIOps.insert(SIO1->getOperand(0));
+ SIOps.insert(SIO1->getOperand(1));
+ SIOps.insert(SIO2->getOperand(0));
+ SIOps.insert(SIO2->getOperand(1));
+ if (SIOps.size() <= 2)
+ continue;
+ }
+ }
+
+ int ESContrib;
+ // This pair has already been formed.
+ if (IncomingPairs.count(VP)) {
+ continue;
+ } else if (IncomingPairs.count(VPR)) {
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+
+ if (VTy->getVectorNumElements() == 2)
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_Reverse, VTy));
+ } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
+ ESContrib = (int) TTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 0);
+ ESContrib += (int) TTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 1);
+ } else if (!Ty1->isVectorTy()) {
+ // O1 needs to be inserted into a vector of the same size as O2, and then
+ // both need to be shuffled together.
+ ESContrib = (int) TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty2, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty2);
+ } else if (!Ty2->isVectorTy()) {
+ // O2 needs to be inserted into a vector of the same size as O1, and then
+ // both need to be shuffled together.
+ ESContrib = (int) TTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty1, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty1);
+ } else {
+ Type *TyBig = Ty1, *TySmall = Ty2;
+ if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
+ std::swap(TyBig, TySmall);
+
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, TyBig);
+ if (TyBig != TySmall)
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ TyBig, TySmall);
+ }
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
+ << *O1 << " <-> " << *O2 << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ IncomingPairs.insert(VP);
+ }
+ }
+ }
+
+ if (!HasNontrivialInsts) {
+ DEBUG(if (DebugPairSelection) dbgs() <<
+ "\tNo non-trivial instructions in DAG;"
+ " override to zero effective size\n");
+ EffSize = 0;
+ }
+ } else {
+ for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+ E = PrunedDAG.end(); S != E; ++S)
+ EffSize += (int) getDepthFactor(S->first);
+ }
+
+ DEBUG(if (DebugPairSelection)
+ dbgs() << "BBV: found pruned DAG for pair {"
+ << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
+ MaxDepth << " and size " << PrunedDAG.size() <<
+ " (effective size: " << EffSize << ")\n");
+ if (((TTI && !UseChainDepthWithTI) ||
+ MaxDepth >= Config.ReqChainDepth) &&
+ EffSize > 0 && EffSize > BestEffSize) {
+ BestMaxDepth = MaxDepth;
+ BestEffSize = EffSize;
+ BestDAG = PrunedDAG;
+ }
+ }
+ }
+
+ // Given the list of candidate pairs, this function selects those
+ // that will be fused into vector instructions.
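+ // Selection is greedy: for each pairable instruction in turn, the best DAG
+ // rooted at one of its remaining candidate pairs is found, every pair in
+ // that DAG is committed to ChosenPairs, and all other candidate pairs
+ // involving the newly paired instructions are discarded.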
+ void BBVectorize::choosePairs(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs) {
+ bool UseCycleCheck =
+ CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
+
+ DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
+ for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
+ E = CandidatePairsSet.end(); I != E; ++I) {
+ std::vector<Value *> &JJ = CandidatePairs2[I->second];
+ if (JJ.empty()) JJ.reserve(32);
+ JJ.push_back(I->first);
+ }
+
+ DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
+ DenseSet<VPPair> PairableInstUserPairSet;
+ for (std::vector<Value *>::iterator I = PairableInsts.begin(),
+ E = PairableInsts.end(); I != E; ++I) {
+ // The number of possible pairings for this variable:
+ size_t NumChoices = CandidatePairs.lookup(*I).size();
+ if (!NumChoices) continue;
+
+ std::vector<Value *> &JJ = CandidatePairs[*I];
+
+ // The best pair to choose and its dag:
+ size_t BestMaxDepth = 0;
+ int BestEffSize = 0;
+ DenseSet<ValuePair> BestDAG;
+ findBestDAGFor(CandidatePairs, CandidatePairsSet,
+ CandidatePairCostSavings,
+ PairableInsts, FixedOrderPairs, PairConnectionTypes,
+ ConnectedPairs, ConnectedPairDeps,
+ PairableInstUsers, PairableInstUserMap,
+ PairableInstUserPairSet, ChosenPairs,
+ BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
+ UseCycleCheck);
+
+ if (BestDAG.empty())
+ continue;
+
+ // A dag has been chosen (or not) at this point. If no dag was
+ // chosen, then this instruction, I, cannot be paired (and is no longer
+ // considered).
+
+ DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
+ << *cast<Instruction>(*I) << "\n");
+
+ for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
+ SE2 = BestDAG.end(); S != SE2; ++S) {
+ // Insert the members of this dag into the list of chosen pairs.
+ ChosenPairs.insert(ValuePair(S->first, S->second));
+ DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
+ *S->second << "\n");
+
+ // Remove all candidate pairs that have values in the chosen dag.
+ std::vector<Value *> &KK = CandidatePairs[S->first];
+ for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
+ K != KE; ++K) {
+ if (*K == S->second)
+ continue;
+
+ CandidatePairsSet.erase(ValuePair(S->first, *K));
+ }
+
+ std::vector<Value *> &LL = CandidatePairs2[S->second];
+ for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
+ L != LE; ++L) {
+ if (*L == S->first)
+ continue;
+
+ CandidatePairsSet.erase(ValuePair(*L, S->second));
+ }
+
+ std::vector<Value *> &MM = CandidatePairs[S->second];
+ for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
+ M != ME; ++M) {
+ assert(*M != S->first && "Flipped pair in candidate list?");
+ CandidatePairsSet.erase(ValuePair(S->second, *M));
+ }
+
+ std::vector<Value *> &NN = CandidatePairs2[S->first];
+ for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
+ N != NE; ++N) {
+ assert(*N != S->second && "Flipped pair in candidate list?");
+ CandidatePairsSet.erase(ValuePair(*N, S->first));
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
+ }
+
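+ // Produces a name for a replacement instruction, derived from the original
+ // instruction's name plus ".v.i" (for inputs) or ".v.r" (for results), the
+ // operand index o, and an optional sequence number n; unnamed instructions
+ // yield an empty name.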
+ std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
+ unsigned n = 0) {
+ if (!I->hasName())
+ return "";
+
+ return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
+ (n > 0 ? "." + utostr(n) : "")).str();
+ }
+
+ // Returns the value that is to be used as the pointer input to the vector
+ // instruction that fuses I with J.
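+ // The two pointee types are combined into a single vector type, and the
+ // pointer with the lower offset (I's, given the precomputed pair order) is
+ // bitcast to a pointer to that vector type in the same address space.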
+ Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
+ Instruction *I, Instruction *J, unsigned o) {
+ Value *IPtr, *JPtr;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+ int64_t OffsetInElmts;
+
+ // Note: the analysis might fail here; that is why the pair order has
+ // been precomputed (OffsetInElmts must be left unused here).
+ (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ IAddressSpace, JAddressSpace,
+ OffsetInElmts, false);
+
+ // The pointer value is taken to be the one with the lowest offset.
+ Value *VPtr = IPtr;
+
+ Type *ArgTypeI = IPtr->getType()->getPointerElementType();
+ Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
+ Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+ Type *VArgPtrType
+ = PointerType::get(VArgType,
+ IPtr->getType()->getPointerAddressSpace());
+ return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
+ /* insert before */ I);
+ }
+
+ void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+ unsigned MaskOffset, unsigned NumInElem,
+ unsigned NumInElem1, unsigned IdxOffset,
+ std::vector<Constant*> &Mask) {
+ unsigned NumElem1 = J->getType()->getVectorNumElements();
+ for (unsigned v = 0; v < NumElem1; ++v) {
+ int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
+ if (m < 0) {
+ Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
+ } else {
+ unsigned mm = m + (int) IdxOffset;
+ if (m >= (int) NumInElem1)
+ mm += (int) NumInElem;
+
+ Mask[v+MaskOffset] =
+ ConstantInt::get(Type::getInt32Ty(Context), mm);
+ }
+ }
+ }
+
+ // Returns the value that is to be used as the vector-shuffle mask to the
+ // vector instruction that fuses I with J.
+ Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
+ Instruction *I, Instruction *J) {
+ // This is the shuffle mask. We need to append the second
+ // mask to the first, and the numbers need to be adjusted.
+
+ Type *ArgTypeI = I->getType();
+ Type *ArgTypeJ = J->getType();
+ Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+ unsigned NumElemI = ArgTypeI->getVectorNumElements();
+
+ // Get the total number of elements in the fused vector type.
+ // By definition, this must equal the number of elements in
+ // the final mask.
+ unsigned NumElem = VArgType->getVectorNumElements();
+ std::vector<Constant*> Mask(NumElem);
+
+ Type *OpTypeI = I->getOperand(0)->getType();
+ unsigned NumInElemI = OpTypeI->getVectorNumElements();
+ Type *OpTypeJ = J->getOperand(0)->getType();
+ unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
+
+ // The fused vector will be:
+ // -----------------------------------------------------
+ // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
+ // -----------------------------------------------------
+ // from which we'll extract NumElem total elements (where the first NumElemI
+ // of them come from the mask in I and the remainder come from the mask
+ // in J).
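+ // For example, if I and J are both shuffles producing <2 x float> from
+ // <2 x float> inputs, with masks <0, 3> and <1, 2> respectively, then the
+ // fused operands occupy indices [0,1] (I's first input), [2,3] (J's first
+ // input), [4,5] (I's second input) and [6,7] (J's second input), and the
+ // combined mask becomes <0, 5, 3, 6>.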
+
+ // For the mask from the first pair...
+ fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
+ 0, Mask);
+
+ // For the mask from the second pair...
+ fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
+ NumInElemI, Mask);
+
+ return ConstantVector::get(Mask);
+ }
+
+ bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, Value *&LOp,
+ unsigned numElemL,
+ Type *ArgTypeL, Type *ArgTypeH,
+ bool IBeforeJ, unsigned IdxOff) {
+ bool ExpandedIEChain = false;
+ if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
+ // If we have a pure insertelement chain, then this can be rewritten
+ // into a chain that directly builds the larger type.
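+ // For example, a chain inserting scalars into an undef <2 x float> at
+ // indices 0 and 1 is re-emitted as inserts into an undef vector of the
+ // wider type ArgTypeH at indices IdxOff + 0 and IdxOff + 1.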
+ if (isPureIEChain(LIE)) {
+ SmallVector<Value *, 8> VectElemts(numElemL,
+ UndefValue::get(ArgTypeL->getScalarType()));
+ InsertElementInst *LIENext = LIE;
+ do {
+ unsigned Idx =
+ cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
+ VectElemts[Idx] = LIENext->getOperand(1);
+ } while ((LIENext =
+ dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
+
+ LIENext = nullptr;
+ Value *LIEPrev = UndefValue::get(ArgTypeH);
+ for (unsigned i = 0; i < numElemL; ++i) {
+ if (isa<UndefValue>(VectElemts[i])) continue;
+ LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
+ ConstantInt::get(Type::getInt32Ty(Context),
+ i + IdxOff),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, i+1));
+ LIENext->insertBefore(IBeforeJ ? J : I);
+ LIEPrev = LIENext;
+ }
+
+ LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
+ ExpandedIEChain = true;
+ }
+ }
+
+ return ExpandedIEChain;
+ }
+
+ static unsigned getNumScalarElements(Type *Ty) {
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VecTy->getNumElements();
+ return 1;
+ }
+
+ // Returns the value to be used as the specified operand of the vector
+ // instruction that fuses I with J.
+ Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool IBeforeJ) {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+ // Compute the fused vector type for this operand
+ Type *ArgTypeI = I->getOperand(o)->getType();
+ Type *ArgTypeJ = J->getOperand(o)->getType();
+ VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+ Instruction *L = I, *H = J;
+ Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
+
+ unsigned numElemL = getNumScalarElements(ArgTypeL);
+ unsigned numElemH = getNumScalarElements(ArgTypeH);
+
+ Value *LOp = L->getOperand(o);
+ Value *HOp = H->getOperand(o);
+ unsigned numElem = VArgType->getNumElements();
+
+ // First, we check if we can reuse the "original" vector outputs (if these
+ // exist). We might need a shuffle.
+ ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
+ ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
+ ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
+ ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
+
+ // FIXME: If we're fusing shuffle instructions, then we can't apply this
+ // optimization. The input vectors to the shuffle might be a different
+ // length from the shuffle outputs. Unfortunately, the replacement
+ // shuffle mask has already been formed, and the mask entries are sensitive
+ // to the sizes of the inputs.
+ bool IsSizeChangeShuffle =
+ isa<ShuffleVectorInst>(L) &&
+ (LOp->getType() != L->getType() || HOp->getType() != H->getType());
+
+ if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
+ // We can have at most two unique vector inputs.
+ bool CanUseInputs = true;
+ Value *I1, *I2 = nullptr;
+ if (LEE) {
+ I1 = LEE->getOperand(0);
+ } else {
+ I1 = LSV->getOperand(0);
+ I2 = LSV->getOperand(1);
+ if (I2 == I1 || isa<UndefValue>(I2))
+ I2 = nullptr;
+ }
+
+ if (HEE) {
+ Value *I3 = HEE->getOperand(0);
+ if (!I2 && I3 != I1)
+ I2 = I3;
+ else if (I3 != I1 && I3 != I2)
+ CanUseInputs = false;
+ } else {
+ Value *I3 = HSV->getOperand(0);
+ if (!I2 && I3 != I1)
+ I2 = I3;
+ else if (I3 != I1 && I3 != I2)
+ CanUseInputs = false;
+
+ if (CanUseInputs) {
+ Value *I4 = HSV->getOperand(1);
+ if (!isa<UndefValue>(I4)) {
+ if (!I2 && I4 != I1)
+ I2 = I4;
+ else if (I4 != I1 && I4 != I2)
+ CanUseInputs = false;
+ }
+ }
+ }
+
+ if (CanUseInputs) {
+ unsigned LOpElem =
+ cast<Instruction>(LOp)->getOperand(0)->getType()
+ ->getVectorNumElements();
+
+ unsigned HOpElem =
+ cast<Instruction>(HOp)->getOperand(0)->getType()
+ ->getVectorNumElements();
+
+ // We have one or two input vectors. We need to map each index of the
+ // operands to the index of the original vector.
+ SmallVector<std::pair<int, int>, 8> II(numElem);
+ for (unsigned i = 0; i < numElemL; ++i) {
+ int Idx, INum;
+ if (LEE) {
+ Idx =
+ cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
+ INum = LEE->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx = LSV->getMaskValue(i);
+ if (Idx < (int) LOpElem) {
+ INum = LSV->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx -= LOpElem;
+ INum = LSV->getOperand(1) == I1 ? 0 : 1;
+ }
+ }
+
+ II[i] = std::pair<int, int>(Idx, INum);
+ }
+ for (unsigned i = 0; i < numElemH; ++i) {
+ int Idx, INum;
+ if (HEE) {
+ Idx =
+ cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
+ INum = HEE->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx = HSV->getMaskValue(i);
+ if (Idx < (int) HOpElem) {
+ INum = HSV->getOperand(0) == I1 ? 0 : 1;
+ } else {
+ Idx -= HOpElem;
+ INum = HSV->getOperand(1) == I1 ? 0 : 1;
+ }
+ }
+
+ II[i + numElemL] = std::pair<int, int>(Idx, INum);
+ }
+
+ // We now have an array which tells us from which index of which
+ // input vector each element of the operand comes.
+ VectorType *I1T = cast<VectorType>(I1->getType());
+ unsigned I1Elem = I1T->getNumElements();
+
+ if (!I2) {
+ // In this case there is only one underlying vector input. Check for
+ // the trivial case where we can use the input directly.
+ if (I1Elem == numElem) {
+ bool ElemInOrder = true;
+ for (unsigned i = 0; i < numElem; ++i) {
+ if (II[i].first != (int) i && II[i].first != -1) {
+ ElemInOrder = false;
+ break;
+ }
+ }
+
+ if (ElemInOrder)
+ return I1;
+ }
+
+ // A shuffle is needed.
+ std::vector<Constant *> Mask(numElem);
+ for (unsigned i = 0; i < numElem; ++i) {
+ int Idx = II[i].first;
+ if (Idx == -1)
+ Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
+ else
+ Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+ }
+
+ Instruction *S =
+ new ShuffleVectorInst(I1, UndefValue::get(I1T),
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o));
+ S->insertBefore(IBeforeJ ? J : I);
+ return S;
+ }
+
+ VectorType *I2T = cast<VectorType>(I2->getType());
+ unsigned I2Elem = I2T->getNumElements();
+
+ // This input comes from two distinct vectors. The first step is to
+ // make sure that both vectors are the same length. If not, the
+ // smaller one will need to grow before they can be shuffled together.
+ if (I1Elem < I2Elem) {
+ std::vector<Constant *> Mask(I2Elem);
+ unsigned v = 0;
+ for (; v < I1Elem; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < I2Elem; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ Instruction *NewI1 =
+ new ShuffleVectorInst(I1, UndefValue::get(I1T),
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ NewI1->insertBefore(IBeforeJ ? J : I);
+ I1 = NewI1;
+ I1Elem = I2Elem;
+ } else if (I1Elem > I2Elem) {
+ std::vector<Constant *> Mask(I1Elem);
+ unsigned v = 0;
+ for (; v < I2Elem; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < I1Elem; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ Instruction *NewI2 =
+ new ShuffleVectorInst(I2, UndefValue::get(I2T),
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ NewI2->insertBefore(IBeforeJ ? J : I);
+ I2 = NewI2;
+ }
+
+ // Now that both I1 and I2 are the same length we can shuffle them
+ // together (and use the result).
+ std::vector<Constant *> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v) {
+ if (II[v].first == -1) {
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+ } else {
+ int Idx = II[v].first + II[v].second * I1Elem;
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+ }
+ }
+
+ Instruction *NewOp =
+ new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ NewOp->insertBefore(IBeforeJ ? J : I);
+ return NewOp;
+ }
+ }
+
+ Type *ArgType = ArgTypeL;
+ if (numElemL < numElemH) {
+ if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
+ ArgTypeL, VArgType, IBeforeJ, 1)) {
+ // This is another short-circuit case: we're combining a scalar into
+ // a vector that is formed by an IE chain. We've just expanded the IE
+ // chain; now insert the scalar and we're done.
+
+ Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ S->insertBefore(IBeforeJ ? J : I);
+ return S;
+ } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
+ ArgTypeH, IBeforeJ)) {
+ // The two vector inputs to the shuffle must be the same length,
+ // so extend the smaller vector to be the same length as the larger one.
+ Instruction *NLOp;
+ if (numElemL > 1) {
+
+ std::vector<Constant *> Mask(numElemH);
+ unsigned v = 0;
+ for (; v < numElemL; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < numElemH; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ } else {
+ NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ }
+
+ NLOp->insertBefore(IBeforeJ ? J : I);
+ LOp = NLOp;
+ }
+
+ ArgType = ArgTypeH;
+ } else if (numElemL > numElemH) {
+ if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
+ ArgTypeH, VArgType, IBeforeJ)) {
+ Instruction *S =
+ InsertElementInst::Create(LOp, HOp,
+ ConstantInt::get(Type::getInt32Ty(Context),
+ numElemL),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o));
+ S->insertBefore(IBeforeJ ? J : I);
+ return S;
+ } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
+ ArgTypeL, IBeforeJ)) {
+ Instruction *NHOp;
+ if (numElemH > 1) {
+ std::vector<Constant *> Mask(numElemL);
+ unsigned v = 0;
+ for (; v < numElemH; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ for (; v < numElemL; ++v)
+ Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+ NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ } else {
+ NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ }
+
+ NHOp->insertBefore(IBeforeJ ? J : I);
+ HOp = NHOp;
+ }
+ }
+
+ if (ArgType->isVectorTy()) {
+ unsigned numElem = VArgType->getVectorNumElements();
+ std::vector<Constant*> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v) {
+ unsigned Idx = v;
+ // If the low vector was expanded, we need to skip the extra
+ // undefined entries.
+ if (v >= numElemL && numElemH > numElemL)
+ Idx += (numElemH - numElemL);
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+ }
+
+ Instruction *BV = new ShuffleVectorInst(LOp, HOp,
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ BV->insertBefore(IBeforeJ ? J : I);
+ return BV;
+ }
+
+ Instruction *BV1 = InsertElementInst::Create(
+ UndefValue::get(VArgType), LOp, CV0,
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ BV1->insertBefore(IBeforeJ ? J : I);
+ Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 2));
+ BV2->insertBefore(IBeforeJ ? J : I);
+ return BV2;
+ }
+
+ // This function creates an array of values that will be used as the inputs
+ // to the vector instruction that fuses I with J.
+ void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
+ Instruction *I, Instruction *J,
+ SmallVectorImpl<Value *> &ReplacedOperands,
+ bool IBeforeJ) {
+ unsigned NumOperands = I->getNumOperands();
+
+ for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
+ // Iterate backward so that we look at the store pointer
+ // first and know whether or not we need to flip the inputs.
+
+ if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
+ // This is the pointer for a load/store instruction.
+ ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
+ continue;
+ } else if (isa<CallInst>(I)) {
+ Function *F = cast<CallInst>(I)->getCalledFunction();
+ Intrinsic::ID IID = F->getIntrinsicID();
+ if (o == NumOperands-1) {
+ BasicBlock &BB = *I->getParent();
+
+ Module *M = BB.getParent()->getParent();
+ Type *ArgTypeI = I->getType();
+ Type *ArgTypeJ = J->getType();
+ Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+ ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
+ continue;
+ } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+ IID == Intrinsic::cttz) && o == 1) {
+ // The second argument of powi/ctlz/cttz is a single integer/constant
+ // and we've already checked that both arguments are equal.
+ // As a result, we just keep I's second argument.
+ ReplacedOperands[o] = I->getOperand(o);
+ continue;
+ }
+ } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
+ ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
+ continue;
+ }
+
+ ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
+ }
+ }
+
+ // This function creates two values that represent the outputs of the
+ // original I and J instructions. These are generally vector shuffles
+ // or extracts. In many cases, these will end up being unused and, thus,
+ // eliminated by later passes.
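+ // K1 recovers I's part of the fused value K (its leading lanes via a
+ // shuffle, or element 0 for a scalar I); K2 recovers J's part (the trailing
+ // lanes, or the last element for a scalar J). Both are inserted immediately
+ // after K, and InsertionPt is advanced past them.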
+ void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt,
+ Instruction *&K1, Instruction *&K2) {
+ if (isa<StoreInst>(I))
+ return;
+
+ Type *IType = I->getType();
+ Type *JType = J->getType();
+
+ VectorType *VType = getVecTypeForPair(IType, JType);
+ unsigned numElem = VType->getNumElements();
+
+ unsigned numElemI = getNumScalarElements(IType);
+ unsigned numElemJ = getNumScalarElements(JType);
+
+ if (IType->isVectorTy()) {
+ std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
+ for (unsigned v = 0; v < numElemI; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
+ }
+
+ K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(Mask1),
+ getReplacementName(K, false, 1));
+ } else {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
+ }
+
+ if (JType->isVectorTy()) {
+ std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
+ for (unsigned v = 0; v < numElemJ; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
+ }
+
+ K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(Mask2),
+ getReplacementName(K, false, 2));
+ } else {
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
+ K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
+ }
+
+ K1->insertAfter(K);
+ K2->insertAfter(K1);
+ InsertionPt = K2;
+ }
+
+ // Returns true if all uses of instruction I (including pairing-induced
+ // uses) can be moved after J without creating a dependency cycle.
+ bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
+ Instruction *I, Instruction *J) {
+ // Skip to the first instruction past I.
+ BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
+
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
+ for (; cast<Instruction>(L) != J; ++L)
+ (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
+
+ assert(cast<Instruction>(L) == J &&
+ "Tracking has not proceeded far enough to check for dependencies");
+ // If J is now in the use set of I, then trackUsesOfI will return true
+ // and we have a dependency cycle (and the fusing operation must abort).
+ return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
+ }
+
+ // Move all uses of instruction I (including pairing-induced uses) after J.
+ void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
+ Instruction *&InsertionPt,
+ Instruction *I, Instruction *J) {
+ // Skip to the first instruction past I.
+ BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
+
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
+ for (; cast<Instruction>(L) != J;) {
+ if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
+ // Move this instruction
+ Instruction *InstToMove = &*L++;
+
+ DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
+ " to after " << *InsertionPt << "\n");
+ InstToMove->removeFromParent();
+ InstToMove->insertAfter(InsertionPt);
+ InsertionPt = InstToMove;
+ } else {
+ ++L;
+ }
+ }
+ }
+
+ // Collect all load instructions that are in the move set of a given first
+ // pair member. These loads depend on the first instruction, I, and so need
+ // to be moved after J (the second instruction) when the pair is fused.
+ void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
+ Instruction *I) {
+ // Skip to the first instruction past I.
+ BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
+
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
+ // Note: We cannot end the loop when we reach J because J could be moved
+ // farther down the use chain by another instruction pairing. Also, J
+ // could be before I if this is an inverted input.
+ for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
+ if (trackUsesOfI(Users, WriteSet, I, &*L)) {
+ if (L->mayReadFromMemory()) {
+ LoadMoveSet[&*L].push_back(I);
+ LoadMoveSetPairs.insert(ValuePair(&*L, I));
+ }
+ }
+ }
+ }
+
+ // In cases where both load/stores and the computation of their pointers
+ // are chosen for vectorization, we can end up in a situation where the
+ // aliasing analysis starts returning different query results as the
+ // process of fusing instruction pairs continues. Because the algorithm
+ // relies on finding the same use dags here as were found earlier, we'll
+ // need to precompute the necessary aliasing information here and then
+ // manually update it during the fusion process.
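+ // The result maps each potentially movable load to the first pair members
+ // it depends on; LoadMoveSetPairs holds the same (load, first member)
+ // relation as a set for fast queries during fusion.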
+ void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs) {
+ for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+ PIE = PairableInsts.end(); PI != PIE; ++PI) {
+ DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+ if (P == ChosenPairs.end()) continue;
+
+ Instruction *I = cast<Instruction>(P->first);
+ collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
+ LoadMoveSetPairs, I);
+ }
+ }
+
+ // This function fuses the chosen instruction pairs into vector instructions,
+ // taking care to preserve any needed scalar outputs, and then reorders the
+ // remaining instructions as needed (users of the first member of the pair
+ // need to be moved after the location of the second member of the pair
+ // because the vector instruction is inserted at the location of the pair's
+ // second member).
+ void BBVectorize::fuseChosenPairs(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
+ LLVMContext& Context = BB.getContext();
+
+ // During the vectorization process, the order of the pairs to be fused
+ // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
+ // list. After a pair is fused, the flipped pair is removed from the list.
+ DenseSet<ValuePair> FlippedPairs;
+ for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
+ E = ChosenPairs.end(); P != E; ++P)
+ FlippedPairs.insert(ValuePair(P->second, P->first));
+ for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
+ E = FlippedPairs.end(); P != E; ++P)
+ ChosenPairs.insert(*P);
+
+ DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
+ DenseSet<ValuePair> LoadMoveSetPairs;
+ collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
+ LoadMoveSet, LoadMoveSetPairs);
+
+ DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
+
+ for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
+ DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
+ if (P == ChosenPairs.end()) {
+ ++PI;
+ continue;
+ }
+
+ if (getDepthFactor(P->first) == 0) {
+ // These instructions are not really fused, but are tracked as though
+ // they are. Any case in which it would be interesting to fuse them
+ // will be taken care of by InstCombine.
+ --NumFusedOps;
+ ++PI;
+ continue;
+ }
+
+ Instruction *I = cast<Instruction>(P->first),
+ *J = cast<Instruction>(P->second);
+
+ DEBUG(dbgs() << "BBV: fusing: " << *I <<
+ " <-> " << *J << "\n");
+
+ // Remove the pair and flipped pair from the list.
+ DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
+ assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
+ ChosenPairs.erase(FP);
+ ChosenPairs.erase(P);
+
+ if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
+ DEBUG(dbgs() << "BBV: fusion of: " << *I <<
+ " <-> " << *J <<
+ " aborted because of non-trivial dependency cycle\n");
+ --NumFusedOps;
+ ++PI;
+ continue;
+ }
+
+ // If the pair must have the other order, then flip it.
+ bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
+ if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
+ // This pair does not have a fixed order, and so we might want to
+ // flip it if that will yield fewer shuffles. We count the number
+ // of dependencies connected via swaps, and those directly connected,
+ // and flip the order if the number of swaps is greater.
+ bool OrigOrder = true;
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
+ ConnectedPairDeps.find(ValuePair(I, J));
+ if (IJ == ConnectedPairDeps.end()) {
+ IJ = ConnectedPairDeps.find(ValuePair(J, I));
+ OrigOrder = false;
+ }
+
+ if (IJ != ConnectedPairDeps.end()) {
+ unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+ for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
+ TE = IJ->second.end(); T != TE; ++T) {
+ VPPair Q(IJ->first, *T);
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q.second, Q.first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ ++NumDepsDirect;
+ else if (R->second == PairConnectionSwap)
+ ++NumDepsSwap;
+ }
+
+ if (!OrigOrder)
+ std::swap(NumDepsDirect, NumDepsSwap);
+
+ if (NumDepsSwap > NumDepsDirect) {
+ FlipPairOrder = true;
+ DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
+ " <-> " << *J << "\n");
+ }
+ }
+ }
+
+ Instruction *L = I, *H = J;
+ if (FlipPairOrder)
+ std::swap(H, L);
+
+ // If the pair being fused uses the opposite order from that in the pair
+ // connection map, then we need to flip the types.
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
+ ConnectedPairs.find(ValuePair(H, L));
+ if (HL != ConnectedPairs.end())
+ for (std::vector<ValuePair>::iterator T = HL->second.begin(),
+ TE = HL->second.end(); T != TE; ++T) {
+ VPPair Q(HL->first, *T);
+ DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ R->second = PairConnectionSwap;
+ else if (R->second == PairConnectionSwap)
+ R->second = PairConnectionDirect;
+ }
+
+ bool LBeforeH = !FlipPairOrder;
+ unsigned NumOperands = I->getNumOperands();
+ SmallVector<Value *, 3> ReplacedOperands(NumOperands);
+ getReplacementInputsForPair(Context, L, H, ReplacedOperands,
+ LBeforeH);
+
+ // Make a copy of the original operation, change its type to the vector
+ // type and replace its operands with the vector operands.
+ Instruction *K = L->clone();
+ if (L->hasName())
+ K->takeName(L);
+ else if (H->hasName())
+ K->takeName(H);
+
+ if (auto CS = CallSite(K)) {
+ SmallVector<Type *, 3> Tys;
+ FunctionType *Old = CS.getFunctionType();
+ unsigned NumOld = Old->getNumParams();
+ assert(NumOld <= ReplacedOperands.size());
+ for (unsigned i = 0; i != NumOld; ++i)
+ Tys.push_back(ReplacedOperands[i]->getType());
+ CS.mutateFunctionType(
+ FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
+ Tys, Old->isVarArg()));
+ } else if (!isa<StoreInst>(K))
+ K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
+
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(K, H, KnownIDs);
+ K->intersectOptionalDataWith(H);
+
+ for (unsigned o = 0; o < NumOperands; ++o)
+ K->setOperand(o, ReplacedOperands[o]);
+
+ K->insertAfter(J);
+
+ // Instruction insertion point:
+ Instruction *InsertionPt = K;
+ Instruction *K1 = nullptr, *K2 = nullptr;
+ replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
+
+ // The use dag of the first original instruction must be moved to after
+ // the location of the second instruction. The entire use dag of the
+ // first instruction is disjoint from the input dag of the second
+ // (by definition), and so commutes with it.
+
+ moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
+
+ if (!isa<StoreInst>(I)) {
+ L->replaceAllUsesWith(K1);
+ H->replaceAllUsesWith(K2);
+ }
+
+ // Instructions that may read from memory may be in the load move set.
+ // Once an instruction is fused, we no longer need its move set, and so
+ // the values of the map never need to be updated. However, when a load
+ // is fused, we need to merge the entries from both instructions in the
+ // pair in case those instructions were in the move set of some other
+ // yet-to-be-fused pair. The loads in question are the keys of the map.
+ if (I->mayReadFromMemory()) {
+ std::vector<ValuePair> NewSetMembers;
+ DenseMap<Value *, std::vector<Value *> >::iterator II =
+ LoadMoveSet.find(I);
+ if (II != LoadMoveSet.end())
+ for (std::vector<Value *>::iterator N = II->second.begin(),
+ NE = II->second.end(); N != NE; ++N)
+ NewSetMembers.push_back(ValuePair(K, *N));
+ DenseMap<Value *, std::vector<Value *> >::iterator JJ =
+ LoadMoveSet.find(J);
+ if (JJ != LoadMoveSet.end())
+ for (std::vector<Value *>::iterator N = JJ->second.begin(),
+ NE = JJ->second.end(); N != NE; ++N)
+ NewSetMembers.push_back(ValuePair(K, *N));
+ for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
+ AE = NewSetMembers.end(); A != AE; ++A) {
+ LoadMoveSet[A->first].push_back(A->second);
+ LoadMoveSetPairs.insert(*A);
+ }
+ }
+
+ // Before removing I, set the iterator to the next instruction.
+ PI = std::next(BasicBlock::iterator(I));
+ if (cast<Instruction>(PI) == J)
+ ++PI;
+
+ SE->forgetValue(I);
+ SE->forgetValue(J);
+ I->eraseFromParent();
+ J->eraseFromParent();
+
+ DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
+ BB << "\n");
+ }
+
+ DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
+ }
+}
+
+char BBVectorize::ID = 0;
+static const char bb_vectorize_name[] = "Basic-Block Vectorization";
+INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+
+BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
+ return new BBVectorize(C);
+}
+
+bool
+llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
+ BBVectorize BBVectorizer(P, *BB.getParent(), C);
+ return BBVectorizer.vectorizeBB(BB);
+}
+
+//===----------------------------------------------------------------------===//
+VectorizeConfig::VectorizeConfig() {
+ VectorBits = ::VectorBits;
+ VectorizeBools = !::NoBools;
+ VectorizeInts = !::NoInts;
+ VectorizeFloats = !::NoFloats;
+ VectorizePointers = !::NoPointers;
+ VectorizeCasts = !::NoCasts;
+ VectorizeMath = !::NoMath;
+ VectorizeBitManipulations = !::NoBitManipulation;
+ VectorizeFMA = !::NoFMA;
+ VectorizeSelect = !::NoSelect;
+ VectorizeCmp = !::NoCmp;
+ VectorizeGEP = !::NoGEP;
+ VectorizeMemOps = !::NoMemOps;
+ AlignedOnly = ::AlignedOnly;
+  ReqChainDepth = ::ReqChainDepth;
+ SearchLimit = ::SearchLimit;
+ MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
+ SplatBreaksChain = ::SplatBreaksChain;
+ MaxInsts = ::MaxInsts;
+ MaxPairs = ::MaxPairs;
+ MaxIter = ::MaxIter;
+ Pow2LenOnly = ::Pow2LenOnly;
+ NoMemOpBoost = ::NoMemOpBoost;
+ FastDep = ::FastDep;
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 0000000..2c0d317
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,5823 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
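+//
+// For illustration only (not part of this file), a scalar loop such as
+//   for (i = 0; i < n; ++i) A[i] = B[i] + 1;
+// is conceptually rewritten so that each vector iteration handles VF elements:
+//   for (i = 0; i + 3 < n; i += 4) A[i:i+3] = B[i:i+3] + <1, 1, 1, 1>;
+// with a scalar epilogue loop executing any remaining iterations.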
+//
+// This pass has four parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// The interleaved access vectorization is based on the paper:
+// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
+// Data for SIMD
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <tuple>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
+static cl::opt<bool>
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+/// We don't vectorize loops with a known constant trip count below this number.
+static cl::opt<unsigned>
+TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
+ cl::Hidden,
+ cl::desc("Don't vectorize loops with a constant "
+ "trip count that is smaller than this "
+ "value."));
+
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
+/// This enables versioning on the strides of symbolically striding memory
+/// accesses in code like the following.
+/// for (i = 0; i < N; ++i)
+/// A[i * Stride1] += B[i * Stride2] ...
+///
+/// Will be roughly translated to
+/// if (Stride1 == 1 && Stride2 == 1) {
+/// for (i = 0; i < N; i+=4)
+/// A[i:i+3] += ...
+/// } else
+/// ...
+static cl::opt<bool> EnableMemAccessVersioning(
+ "enable-mem-access-versioning", cl::init(true), cl::Hidden,
+ cl::desc("Enable symbolic stride memory access versioning"));
+
+static cl::opt<bool> EnableInterleavedMemAccesses(
+ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+
+/// Maximum factor for an interleaved memory access.
+static cl::opt<unsigned> MaxInterleaveGroupFactor(
+ "max-interleave-group-factor", cl::Hidden,
+ cl::desc("Maximum factor for an interleaved access group (default = 8)"),
+ cl::init(8));
+
+/// We don't interleave loops with a known constant trip count below this
+/// number.
+static const unsigned TinyTripCountInterleaveThreshold = 128;
+
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+ "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+ "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of vector registers."));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+ "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "scalar loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+ "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+ "force-target-instruction-cost", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's expected cost for "
+ "an instruction to a single constant value. Mostly "
+ "useful for getting consistent testing."));
+
+static cl::opt<unsigned> SmallLoopCost(
+ "small-loop-cost", cl::init(20), cl::Hidden,
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+ "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics minimizing code growth in cold regions and being more "
+ "aggressive in hot regions."));
+
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+ "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+ cl::desc("Count the induction variable only once when interleaving"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
+
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
+ "reduction in a nested loop."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+namespace {
+
+// Forward declarations.
+class LoopVectorizeHints;
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class LoopVectorizationRequirements;
+
+/// \brief This modifies LoopAccessReport to initialize message with
+/// loop-vectorizer-specific part.
+class VectorizationReport : public LoopAccessReport {
+public:
+ VectorizationReport(Instruction *I = nullptr)
+ : LoopAccessReport("loop not vectorized: ", I) {}
+
+ /// \brief This allows promotion of the loop-access analysis report into the
+ /// loop-vectorizer report. It modifies the message to add the
+ /// loop-vectorizer-specific part of the message.
+ explicit VectorizationReport(const LoopAccessReport &R)
+ : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
+ R.getInstr()) {}
+};
+
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type* ToVectorTy(Type *Scalar, unsigned VF) {
+ if (Scalar->isVoidTy() || VF == 1)
+ return Scalar;
+ return VectorType::get(Scalar, VF);
+}
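+
+// For example (illustrative): ToVectorTy(i32, 4) returns <4 x i32>, while
+// ToVectorTy(i32, 1) and ToVectorTy(void, 4) return the scalar and void types
+// unchanged.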
+
+/// A helper function that returns the GEP instruction and knows to skip a
+/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
+/// pointee types of the 'bitcast' have the same size.
+/// For example:
+/// bitcast double** %var to i64* - can be skipped
+/// bitcast double** %var to i8* - cannot
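+/// (The pointee of double** is double*, which occupies 64 bits on a typical
+/// 64-bit target, so it matches i64 but not i8.)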
+static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
+
+ if (isa<GetElementPtrInst>(Ptr))
+ return cast<GetElementPtrInst>(Ptr);
+
+ if (isa<BitCastInst>(Ptr) &&
+ isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
+ Type *BitcastTy = Ptr->getType();
+ Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
+ if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
+ return nullptr;
+ Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
+ Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
+ const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
+ if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
+ return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
+ }
+ return nullptr;
+}
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found in the loop.
+class InnerLoopVectorizer {
+public:
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, unsigned VecWidth,
+ unsigned UnrollFactor)
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+ Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
+ TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
+ AddedSafetyChecks(false) {}
+
+ // Perform the actual loop widening (vectorization).
+ // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
+ // can be validly truncated to. The cost model has assumed this truncation
+ // will happen when vectorizing.
+ void vectorize(LoopVectorizationLegality *L,
+ MapVector<Instruction*,uint64_t> MinimumBitWidths) {
+ MinBWs = MinimumBitWidths;
+ Legal = L;
+ // Create a new empty loop. Unlink the old loop and connect the new one.
+ createEmptyLoop();
+ // Widen each instruction in the old loop to a new one in the new loop.
+ // Use the Legality module to find the induction and reduction variables.
+ vectorizeLoop();
+ }
+
+ // Return true if any runtime check is added.
+ bool IsSafetyChecksAdded() {
+ return AddedSafetyChecks;
+ }
+
+ virtual ~InnerLoopVectorizer() {}
+
+protected:
+ /// A small list of PHINodes.
+ typedef SmallVector<PHINode*, 4> PhiVector;
+ /// When we unroll loops we have multiple vector values for each scalar.
+ /// This data structure holds the unrolled and vectorized values that
+ /// originated from one scalar instruction.
+ typedef SmallVector<Value*, 2> VectorParts;
+
+ // When we if-convert we need to create edge masks. We have to cache values
+ // so that we don't end up with exponential recursion/IR.
+ typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
+ VectorParts> EdgeMaskCache;
+
+ /// Create an empty loop, based on the loop ranges of the old loop.
+ void createEmptyLoop();
+ /// Create a new induction variable inside L.
+ PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+ Value *Step, Instruction *DL);
+ /// Copy and widen the instructions from the old loop.
+ virtual void vectorizeLoop();
+
+ /// \brief The Loop exit block may have single value PHI nodes where the
+ /// incoming value is 'Undef'. While vectorizing we only handled real values
+ /// that were defined inside the loop. Here we fix the 'undef case'.
+ /// See PR14725.
+ void fixLCSSAPHIs();
+
+ /// Shrinks vector element sizes based on information in "MinBWs".
+ void truncateToMinimalBitwidths();
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VectorParts createBlockInMask(BasicBlock *BB);
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+ /// A helper function to vectorize a single BB within the innermost loop.
+ void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
+ unsigned UF, unsigned VF, PhiVector *PV);
+
+ /// Insert the new loop to the loop hierarchy and pass manager
+ /// and update the analysis passes.
+ void updateAnalysis();
+
+ /// This instruction is un-vectorizable. Implement it as a sequence
+ /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+ /// scalarized instruction behind an if block predicated on the control
+ /// dependence of the instruction.
+ virtual void scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore=false);
+
+  /// Vectorize Load and Store instructions.
+ virtual void vectorizeMemoryInstruction(Instruction *Instr);
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...;
+  /// this is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ virtual Value *getBroadcastInstrs(Value *V);
+
+ /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+  /// to each vector element of Val. The sequence starts at StartIdx.
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
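+  // E.g. (illustrative): with VF = 4, StartIdx = 0 and Step = 1 the generated
+  // code adds the constant vector <0, 1, 2, 3> element-wise to Val.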
+
+ /// When we go over instructions in the basic block we rely on previous
+ /// values within the current basic block or on loop invariant values.
+ /// When we widen (vectorize) values we place them in the map. If the values
+ /// are not within the map, they have to be loop invariant, so we simply
+ /// broadcast them into a vector.
+ VectorParts &getVectorValue(Value *V);
+
+ /// Try to vectorize the interleaved access group that \p Instr belongs to.
+ void vectorizeInterleaveGroup(Instruction *Instr);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ virtual Value *reverseVector(Value *Vec);
+
+ /// Returns (and creates if needed) the original loop trip count.
+ Value *getOrCreateTripCount(Loop *NewLoop);
+
+ /// Returns (and creates if needed) the trip count of the widened loop.
+ Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+ /// Emit a bypass check to see if the trip count would overflow, or we
+ /// wouldn't have enough iterations to execute one vector loop.
+ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+ /// Emit a bypass check to see if the vector trip count is nonzero.
+ void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+ /// Emit a bypass check to see if all of the SCEV assumptions we've
+ /// had to make are correct.
+ void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+ /// Emit bypass checks to check any memory assumptions we may have made.
+ void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
+ /// This is a helper class that holds the vectorizer state. It maps scalar
+ /// instructions to vector instructions. When the code is 'unrolled' then
+  /// a single scalar value is mapped to multiple vector parts. The parts
+ /// are stored in the VectorPart type.
+ struct ValueMap {
+ /// C'tor. UnrollFactor controls the number of vectors ('parts') that
+ /// are mapped.
+ ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
+
+ /// \return True if 'Key' is saved in the Value Map.
+ bool has(Value *Key) const { return MapStorage.count(Key); }
+
+ /// Initializes a new entry in the map. Sets all of the vector parts to the
+    /// same value in 'Val'.
+ /// \return A reference to a vector with splat values.
+ VectorParts &splat(Value *Key, Value *Val) {
+ VectorParts &Entry = MapStorage[Key];
+ Entry.assign(UF, Val);
+ return Entry;
+ }
+
+ ///\return A reference to the value that is stored at 'Key'.
+ VectorParts &get(Value *Key) {
+ VectorParts &Entry = MapStorage[Key];
+ if (Entry.empty())
+ Entry.resize(UF);
+ assert(Entry.size() == UF);
+ return Entry;
+ }
+
+ private:
+ /// The unroll factor. Each entry in the map stores this number of vector
+ /// elements.
+ unsigned UF;
+
+ /// Map storage. We use std::map and not DenseMap because insertions to a
+    /// dense map invalidate its iterators.
+ std::map<Value *, VectorParts> MapStorage;
+ };
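+
+  // Usage sketch (illustrative): with UF == 2, WidenMap.splat(V, Broadcast)
+  // maps the scalar V to two identical vector parts, and WidenMap.get(V)[Part]
+  // later returns the widened value for the given unroll part.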
+
+ /// The original loop.
+ Loop *OrigLoop;
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+ /// dynamic knowledge to simplify SCEV expressions and converts them to a
+ /// more usable form.
+ PredicatedScalarEvolution &PSE;
+ /// Loop Info.
+ LoopInfo *LI;
+ /// Dominator Tree.
+ DominatorTree *DT;
+ /// Alias Analysis.
+ AliasAnalysis *AA;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
+ unsigned VF;
+
+protected:
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
+
+ /// The builder that we use
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+  /// The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  /// The vector loop body.
+  SmallVector<BasicBlock *, 4> LoopVectorBody;
+  /// The scalar loop body.
+ BasicBlock *LoopScalarBody;
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction;
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction;
+ /// Maps scalars to widened vectors.
+ ValueMap WidenMap;
+ /// Store instructions that should be predicated, as a pair
+ /// <StoreInst, Predicate>
+ SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
+ EdgeMaskCache MaskCache;
+ /// Trip count of the original loop.
+ Value *TripCount;
+ /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+ Value *VectorTripCount;
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction*,uint64_t> MinBWs;
+ LoopVectorizationLegality *Legal;
+
+ // Record whether runtime check is added.
+ bool AddedSafetyChecks;
+};
+
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+ InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, unsigned UnrollFactor)
+ : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+
+private:
+ void scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore = false) override;
+ void vectorizeMemoryInstruction(Instruction *Instr) override;
+ Value *getBroadcastInstrs(Value *V) override;
+ Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
+ Value *reverseVector(Value *Vec) override;
+};
+
+/// \brief Look for a meaningful debug location on the instruction or its
+/// operands.
+static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+ if (!I)
+ return I;
+
+ DebugLoc Empty;
+ if (I->getDebugLoc() != Empty)
+ return I;
+
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+ if (OpInst->getDebugLoc() != Empty)
+ return OpInst;
+ }
+
+ return I;
+}
+
+/// \brief Set the debug location in the builder using the debug location in the
+/// instruction.
+static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
+ B.SetCurrentDebugLocation(Inst->getDebugLoc());
+ else
+ B.SetCurrentDebugLocation(DebugLoc());
+}
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+ std::string Result;
+ if (L) {
+ raw_string_ostream OS(Result);
+ if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+ LoopDbgLoc.print(OS);
+ else
+ // Just print the module name.
+ OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+ OS.flush();
+ }
+ return Result;
+}
+#endif
+
+/// \brief Propagate known metadata from one instruction to another.
+static void propagateMetadata(Instruction *To, const Instruction *From) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ From->getAllMetadataOtherThanDebugLoc(Metadata);
+
+ for (auto M : Metadata) {
+ unsigned Kind = M.first;
+
+ // These are safe to transfer (this is safe for TBAA, even when we
+ // if-convert, because should that metadata have had a control dependency
+ // on the condition, and thus actually aliased with some other
+ // non-speculated memory access when the condition was false, this would be
+ // caught by the runtime overlap checks).
+ if (Kind != LLVMContext::MD_tbaa &&
+ Kind != LLVMContext::MD_alias_scope &&
+ Kind != LLVMContext::MD_noalias &&
+ Kind != LLVMContext::MD_fpmath &&
+ Kind != LLVMContext::MD_nontemporal)
+ continue;
+
+ To->setMetadata(Kind, M.second);
+ }
+}
+
+/// \brief Propagate known metadata from one instruction to a vector of others.
+static void propagateMetadata(SmallVectorImpl<Value *> &To,
+ const Instruction *From) {
+ for (Value *V : To)
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ propagateMetadata(I, From);
+}
+
+/// \brief The group of interleaved loads/stores sharing the same stride and
+/// close to each other.
+///
+/// Each member in this group has an index starting from 0, and the largest
+/// index should be less than the interleave factor, which is equal to the absolute
+/// value of the access's stride.
+///
+/// E.g. An interleaved load group of factor 4:
+/// for (unsigned i = 0; i < 1024; i+=4) {
+/// a = A[i]; // Member of index 0
+/// b = A[i+1]; // Member of index 1
+/// d = A[i+3]; // Member of index 3
+/// ...
+/// }
+///
+/// An interleaved store group of factor 4:
+/// for (unsigned i = 0; i < 1024; i+=4) {
+/// ...
+/// A[i] = a; // Member of index 0
+/// A[i+1] = b; // Member of index 1
+/// A[i+2] = c; // Member of index 2
+/// A[i+3] = d; // Member of index 3
+/// }
+///
+/// Note: the interleaved load group could have gaps (missing members), but
+/// the interleaved store group doesn't allow gaps.
+class InterleaveGroup {
+public:
+ InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+ : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
+ assert(Align && "The alignment should be non-zero");
+
+ Factor = std::abs(Stride);
+ assert(Factor > 1 && "Invalid interleave factor");
+
+ Reverse = Stride < 0;
+ Members[0] = Instr;
+ }
+
+ bool isReverse() const { return Reverse; }
+ unsigned getFactor() const { return Factor; }
+ unsigned getAlignment() const { return Align; }
+ unsigned getNumMembers() const { return Members.size(); }
+
+ /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// alignment \p NewAlign. The index is relative to the leader and it could be
+ /// negative if it is the new leader.
+ ///
+ /// \returns false if the instruction doesn't belong to the group.
+ bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+ assert(NewAlign && "The new member's alignment should be non-zero");
+
+ int Key = Index + SmallestKey;
+
+ // Skip if there is already a member with the same index.
+ if (Members.count(Key))
+ return false;
+
+ if (Key > LargestKey) {
+ // The largest index is always less than the interleave factor.
+ if (Index >= static_cast<int>(Factor))
+ return false;
+
+ LargestKey = Key;
+ } else if (Key < SmallestKey) {
+      // The distance between the largest and smallest keys is always less
+      // than the interleave factor.
+ if (LargestKey - Key >= static_cast<int>(Factor))
+ return false;
+
+ SmallestKey = Key;
+ }
+
+ // It's always safe to select the minimum alignment.
+ Align = std::min(Align, NewAlign);
+ Members[Key] = Instr;
+ return true;
+ }
+
+ /// \brief Get the member with the given index \p Index
+ ///
+  /// \returns nullptr if the group contains no such member.
+ Instruction *getMember(unsigned Index) const {
+ int Key = SmallestKey + Index;
+ if (!Members.count(Key))
+ return nullptr;
+
+ return Members.find(Key)->second;
+ }
+
+ /// \brief Get the index for the given member. Unlike the key in the member
+ /// map, the index starts from 0.
+ unsigned getIndex(Instruction *Instr) const {
+ for (auto I : Members)
+ if (I.second == Instr)
+ return I.first - SmallestKey;
+
+ llvm_unreachable("InterleaveGroup contains no such member");
+ }
+
+ Instruction *getInsertPos() const { return InsertPos; }
+ void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+
+private:
+ unsigned Factor; // Interleave Factor.
+ bool Reverse;
+ unsigned Align;
+ DenseMap<int, Instruction *> Members;
+ int SmallestKey;
+ int LargestKey;
+
+ // To avoid breaking dependences, vectorized instructions of an interleave
+ // group should be inserted at either the first load or the last store in
+ // program order.
+ //
+ // E.g. %even = load i32 // Insert Position
+ // %add = add i32 %even // Use of %even
+ // %odd = load i32
+ //
+ // store i32 %even
+ // %odd = add i32 // Def of %odd
+ // store i32 %odd // Insert Position
+ Instruction *InsertPos;
+};
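+
+// Illustrative example (not from this file): a factor-4 group first created
+// for the access A[i+2] has SmallestKey == 0. Inserting A[i] with Index == -2
+// makes it the new leader (SmallestKey becomes -2), after which
+// getIndex(A[i]) == 0 and getIndex(A[i+2]) == 2.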
+
+/// \brief Drive the analysis of interleaved memory accesses in the loop.
+///
+/// Use this class to analyze interleaved accesses only when we can vectorize
+/// a loop. Otherwise it's meaningless to do analysis as the vectorization
+/// on interleaved accesses is unsafe.
+///
+/// The analysis collects interleave groups and records the relationships
+/// between the member and the group in a map.
+class InterleavedAccessInfo {
+public:
+ InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
+ DominatorTree *DT)
+ : PSE(PSE), TheLoop(L), DT(DT) {}
+
+ ~InterleavedAccessInfo() {
+ SmallSet<InterleaveGroup *, 4> DelSet;
+ // Avoid releasing a pointer twice.
+ for (auto &I : InterleaveGroupMap)
+ DelSet.insert(I.second);
+ for (auto *Ptr : DelSet)
+ delete Ptr;
+ }
+
+ /// \brief Analyze the interleaved accesses and collect them in interleave
+ /// groups. Substitute symbolic strides using \p Strides.
+ void analyzeInterleaving(const ValueToValueMap &Strides);
+
+ /// \brief Check if \p Instr belongs to any interleave group.
+ bool isInterleaved(Instruction *Instr) const {
+ return InterleaveGroupMap.count(Instr);
+ }
+
+ /// \brief Get the interleave group that \p Instr belongs to.
+ ///
+  /// \returns nullptr if it doesn't belong to any group.
+ InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+ if (InterleaveGroupMap.count(Instr))
+ return InterleaveGroupMap.find(Instr)->second;
+ return nullptr;
+ }
+
+private:
+ /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
+ /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
+ /// The interleaved access analysis can also add new predicates (for example
+ /// by versioning strides of pointers).
+ PredicatedScalarEvolution &PSE;
+ Loop *TheLoop;
+ DominatorTree *DT;
+
+ /// Holds the relationships between the members and the interleave group.
+ DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+
+ /// \brief The descriptor for a strided memory access.
+ struct StrideDescriptor {
+ StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
+ unsigned Align)
+ : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
+
+ StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
+
+ int Stride; // The access's stride. It is negative for a reverse access.
+ const SCEV *Scev; // The scalar expression of this access
+ unsigned Size; // The size of the memory object.
+ unsigned Align; // The alignment of this access.
+ };
+
+ /// \brief Create a new interleave group with the given instruction \p Instr,
+ /// stride \p Stride and alignment \p Align.
+ ///
+ /// \returns the newly created interleave group.
+ InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
+ unsigned Align) {
+ assert(!InterleaveGroupMap.count(Instr) &&
+ "Already in an interleaved access group");
+ InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
+ return InterleaveGroupMap[Instr];
+ }
+
+ /// \brief Release the group and remove all the relationships.
+ void releaseGroup(InterleaveGroup *Group) {
+ for (unsigned i = 0; i < Group->getFactor(); i++)
+ if (Instruction *Member = Group->getMember(i))
+ InterleaveGroupMap.erase(Member);
+
+ delete Group;
+ }
+
+ /// \brief Collect all the accesses with a constant stride in program order.
+ void collectConstStridedAccesses(
+ MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
+ const ValueToValueMap &Strides);
+};
+
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them if the user hasn't specifically asked for them.
+class LoopVectorizeHints {
+ enum HintKind {
+ HK_WIDTH,
+ HK_UNROLL,
+ HK_FORCE
+ };
+
+ /// Hint - associates name and validation with the hint value.
+ struct Hint {
+ const char * Name;
+ unsigned Value; // This may have to change for non-numeric values.
+ HintKind Kind;
+
+ Hint(const char * Name, unsigned Value, HintKind Kind)
+ : Name(Name), Value(Value), Kind(Kind) { }
+
+ bool validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ }
+ return false;
+ }
+ };
+
+ /// Vectorization width.
+ Hint Width;
+ /// Vectorization interleave factor.
+ Hint Interleave;
+ /// Vectorization forced
+ Hint Force;
+
+ /// Return the loop metadata prefix.
+ static StringRef Prefix() { return "llvm.loop."; }
+
+public:
+ enum ForceKind {
+ FK_Undefined = -1, ///< Not selected.
+ FK_Disabled = 0, ///< Forcing disabled.
+ FK_Enabled = 1, ///< Forcing enabled.
+ };
+
+ LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor,
+ HK_WIDTH),
+ Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ TheLoop(L) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+ }
+
+ /// Mark the loop L as already vectorized by setting the width to 1.
+ void setAlreadyVectorized() {
+ Width.Value = Interleave.Value = 1;
+ Hint Hints[] = {Width, Interleave};
+ writeHintsToMetadata(Hints);
+ }
+
+ bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(),
+ vectorizeAnalysisPassName(), *F,
+ L->getStartLoc(), emitRemark());
+ return false;
+ }
+
+ if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+ DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(),
+ vectorizeAnalysisPassName(), *F,
+ L->getStartLoc(), emitRemark());
+ return false;
+ }
+
+ if (getWidth() == 1 && getInterleave() == 1) {
+ // FIXME: Add a separate metadata to indicate when the loop has already
+ // been vectorized instead of setting width and count to 1.
+ DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
+ "loop not vectorized: vectorization and interleaving are explicitly "
+ "disabled, or vectorize width and interleave count are both set to "
+ "1");
+ return false;
+ }
+
+ return true;
+ }
+
+ /// Dumps all the hint information.
+ std::string emitRemark() const {
+ VectorizationReport R;
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ R << "vectorization is explicitly disabled";
+ else {
+ R << "use -Rpass-analysis=loop-vectorize for more info";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=true";
+ if (Width.Value != 0)
+ R << ", Vector Width=" << Width.Value;
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << Interleave.Value;
+ R << ")";
+ }
+ }
+
+ return R.str();
+ }
+
+ unsigned getWidth() const { return Width.Value; }
+ unsigned getInterleave() const { return Interleave.Value; }
+ enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+ const char *vectorizeAnalysisPassName() const {
+    // If hints are provided that don't disable vectorization, use the
+ // AlwaysPrint pass name to force the frontend to print the diagnostic.
+ if (getWidth() == 1)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+ return LV_NAME;
+ return DiagnosticInfo::AlwaysPrint;
+ }
+
+ bool allowReordering() const {
+    // When hints enabling vectorization are provided, we allow the vectorizer
+    // to change the order of operations given by the scalar loop. This is not
+    // enabled by default because it can be unsafe or inefficient. For example,
+ // reordering floating-point operations will change the way round-off
+ // error accumulates in the loop.
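+    // (Concretely, (a + b) + c need not equal a + (b + c) in IEEE
+    // floating-point arithmetic.)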
+ return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+ }
+
+private:
+ /// Find hints specified in the loop metadata and update local values.
+ void getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+      // The expected hint is either an MDString or an MDNode whose first
+      // operand is an MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+ }
+
+ /// Checks string hint with one operand and set value if valid.
+ void setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C) return;
+ unsigned Val = C->getZExtValue();
+
+ Hint *Hints[] = {&Width, &Interleave, &Force};
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+ }
+
+ /// Create a new hint from name / value pair.
+ MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {MDString::get(Context, Name),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+ }
+
+ /// Matches metadata with hint name.
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+ MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
+ if (!Name)
+ return false;
+
+ for (auto H : HintTypes)
+ if (Name->getString().endswith(H.Name))
+ return true;
+ return false;
+ }
+
+ /// Sets current hints into loop metadata, keeping other values intact.
+ void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+ if (HintTypes.size() == 0)
+ return;
+
+ // Reserve the first element to LoopID (see below).
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, then ignore the existing operands.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If node in update list, ignore old value.
+ if (!matchesHintMetadataName(Node, HintTypes))
+ MDs.push_back(Node);
+ }
+ }
+
+ // Now, add the missing hints.
+ for (auto H : HintTypes)
+ MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ TheLoop->setLoopID(NewLoopID);
+ }
+
+ /// The loop these hints belong to.
+ const Loop *TheLoop;
+};
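+
+// For reference (illustrative IR, not taken from this file), the loop metadata
+// read and written by LoopVectorizeHints has roughly this shape:
+//   br i1 %exitcond, label %exit, label %loop, !llvm.loop !0
+//   !0 = distinct !{!0, !1, !2}
+//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
+//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}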
+
+static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
+ const LoopVectorizeHints &Hints,
+ const LoopAccessReport &Message) {
+ const char *Name = Hints.vectorizeAnalysisPassName();
+ LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
+}
+
+static void emitMissedWarning(Function *F, Loop *L,
+ const LoopVectorizeHints &LH) {
+ emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ LH.emitRemark());
+
+ if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
+ if (LH.getWidth() != 1)
+ emitLoopVectorizeWarning(
+ F->getContext(), *F, L->getStartLoc(),
+ "failed explicitly specified loop vectorization");
+ else if (LH.getInterleave() != 1)
+ emitLoopInterleaveWarning(
+ F->getContext(), *F, L->getStartLoc(),
+ "failed explicitly specified loop interleaving");
+ }
+}
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+/// will change the order of memory accesses in a way that will change the
+/// correctness of the program.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+ LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ AliasAnalysis *AA, Function *F,
+ const TargetTransformInfo *TTI,
+ LoopAccessAnalysis *LAA,
+ LoopVectorizationRequirements *R,
+ const LoopVectorizeHints *H)
+ : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
+ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
+ Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ Requirements(R), Hints(H) {}
+
+ /// ReductionList contains the reduction descriptors for all
+ /// of the reductions that were found in the loop.
+ typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
+
+ /// InductionList saves induction variables and maps them to the
+ /// induction descriptor.
+ typedef MapVector<PHINode*, InductionDescriptor> InductionList;
+
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ bool canVectorize();
+
+ /// Returns the Induction variable.
+ PHINode *getInduction() { return Induction; }
+
+ /// Returns the reduction variables found in the loop.
+ ReductionList *getReductionVars() { return &Reductions; }
+
+ /// Returns the induction variables found in the loop.
+ InductionList *getInductionVars() { return &Inductions; }
+
+ /// Returns the widest induction type.
+ Type *getWidestInductionType() { return WidestIndTy; }
+
+ /// Returns True if V is an induction variable in this loop.
+ bool isInductionVariable(const Value *V);
+
+ /// Returns True if PN is a reduction variable in this loop.
+ bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+
+ /// Return true if the block BB needs to be predicated in order for the loop
+ /// to be vectorized.
+ bool blockNeedsPredication(BasicBlock *BB);
+
+ /// Check if this pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or when the
+ /// pointer itself is an induction variable.
+ /// This check allows us to vectorize A[idx] into a wide load/store.
+ /// Returns:
+ /// 0 - Stride is unknown or non-consecutive.
+ /// 1 - Address is consecutive.
+ /// -1 - Address is consecutive, and decreasing.
+ int isConsecutivePtr(Value *Ptr);
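+  // E.g. (illustrative): with induction variable i, &A[i] yields 1, &A[N - i]
+  // yields -1, and &A[2 * i] or an unknown stride yields 0.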
+
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
+ /// Returns true if this instruction will remain scalar after vectorization.
+ bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
+
+ /// Returns the information that we collected about runtime memory check.
+ const RuntimePointerChecking *getRuntimePointerChecking() const {
+ return LAI->getRuntimePointerChecking();
+ }
+
+ const LoopAccessInfo *getLAI() const {
+ return LAI;
+ }
+
+ /// \brief Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// \brief Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
+
+ bool hasStride(Value *V) { return StrideSet.count(V); }
+ bool mustCheckStrides() { return !StrideSet.empty(); }
+ SmallPtrSet<Value *, 8>::iterator strides_begin() {
+ return StrideSet.begin();
+ }
+ SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
+
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
+ }
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
+ }
+ /// Returns true if vector representation of the instruction \p I
+ /// requires mask.
+ bool isMaskRequired(const Instruction* I) {
+ return (MaskedOp.count(I) != 0);
+ }
+ unsigned getNumStores() const {
+ return LAI->getNumStores();
+ }
+ unsigned getNumLoads() const {
+ return LAI->getNumLoads();
+ }
+ unsigned getNumPredStores() const {
+ return NumPredStores;
+ }
+private:
+ /// Check if a single basic block loop is vectorizable.
+ /// At this point we know that this is a loop with a constant trip count
+ /// and we only need to check individual instructions.
+ bool canVectorizeInstrs();
+
+ /// When we vectorize loops we may change the order in which
+ /// we read and write from memory. This method checks if it is
+  /// legal to vectorize the code, considering only memory constraints.
+  /// Returns true if the loop is vectorizable.
+ bool canVectorizeMemory();
+
+ /// Return true if we can vectorize this loop using the IF-conversion
+ /// transformation.
+ bool canVectorizeWithIfConvert();
+
+ /// Collect the variables that need to stay uniform after vectorization.
+ void collectLoopUniforms();
+
+ /// Return true if all of the instructions in the block can be speculatively
+ /// executed. \p SafePtrs is a list of addresses that are known to be legal
+ /// and we know that we can read from them without segfault.
+ bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
+
+ /// \brief Collect memory access with loop invariant strides.
+ ///
+ /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
+ /// invariant.
+ void collectStridedAccess(Value *LoadOrStoreInst);
+
+ /// Report an analysis message to assist the user in diagnosing loops that are
+ /// not vectorized. These are handled as LoopAccessReport rather than
+ /// VectorizationReport because the << operator of VectorizationReport returns
+ /// LoopAccessReport.
+ void emitAnalysis(const LoopAccessReport &Message) const {
+ emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
+ }
+
+ unsigned NumPredStores;
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
+ /// Applies dynamic knowledge to simplify SCEV expressions in the context
+ /// of existing SCEV assumptions. The analysis will also add a minimal set
+ /// of new predicates if this is required to enable vectorization and
+ /// unrolling.
+ PredicatedScalarEvolution &PSE;
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
+ /// Parent function
+ Function *TheFunction;
+ /// Target Transform Info
+ const TargetTransformInfo *TTI;
+ /// Dominator Tree.
+ DominatorTree *DT;
+ // LoopAccess analysis.
+ LoopAccessAnalysis *LAA;
+ // And the loop-accesses info corresponding to this loop. This pointer is
+ // null until canVectorizeMemory sets it up.
+ const LoopAccessInfo *LAI;
+
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo InterleaveInfo;
+
+ // --- vectorization state --- //
+
+ /// Holds the integer induction variable. This is the counter of the
+ /// loop.
+ PHINode *Induction;
+ /// Holds the reduction variables.
+ ReductionList Reductions;
+ /// Holds all of the induction variables that we found in the loop.
+ /// Notice that inductions don't need to start at zero and that induction
+ /// variables can be pointers.
+ InductionList Inductions;
+ /// Holds the widest induction type encountered.
+ Type *WidestIndTy;
+
+ /// Allowed outside users. This holds the reduction
+ /// vars which can be accessed from outside the loop.
+ SmallPtrSet<Value*, 4> AllowedExit;
+ /// This set holds the variables which are known to be uniform after
+ /// vectorization.
+ SmallPtrSet<Instruction*, 4> Uniforms;
+
+ /// Can we assume the absence of NaNs.
+ bool HasFunNoNaNAttr;
+
+ /// Vectorization requirements that will go through late-evaluation.
+ LoopVectorizationRequirements *Requirements;
+
+ /// Used to emit an analysis of any legality issues.
+ const LoopVectorizeHints *Hints;
+
+ ValueToValueMap Strides;
+ SmallPtrSet<Value *, 8> StrideSet;
+
+ /// While vectorizing these instructions we have to generate a
+ /// call to the appropriate masked intrinsic
+ SmallPtrSet<const Instruction *, 8> MaskedOp;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because of
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC, const Function *F,
+ const LoopVectorizeHints *Hints)
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), TheFunction(F), Hints(Hints) {}
+
+ /// Information about vectorization costs
+ struct VectorizationFactor {
+ unsigned Width; // Vector width with best cost
+ unsigned Cost; // Cost of the loop with that width
+ };
+ /// \return The most profitable vectorization factor and the cost of that VF.
+ /// This method checks every power of two up to VF. If UserVF is not ZERO
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
+ VectorizationFactor selectVectorizationFactor(bool OptForSize);
+
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+
+ /// \return The desired interleave count.
+ /// If interleave count has been specified by metadata it will be returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
+ unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
+
+ /// \return The most profitable unroll factor.
+ /// This method finds the best unroll-factor based on register pressure and
+ /// other parameters. VF and LoopCost are the selected vectorization factor
+ /// and the cost of the selected VF.
+ unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
+ unsigned LoopCost);
+
+ /// \brief A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ unsigned LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ unsigned MaxLocalUsers;
+ /// Holds the number of instructions in the loop.
+ unsigned NumInstructions;
+ };
+
+ /// \return Returns information about the register usages of the loop for the
+ /// given vectorization factors.
+ SmallVector<RegisterUsage, 8>
+ calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
+
+private:
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ unsigned expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+ /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
+ /// Report an analysis message to assist the user in diagnosing loops that are
+ /// not vectorized. These are handled as LoopAccessReport rather than
+ /// VectorizationReport because the << operator of VectorizationReport returns
+ /// LoopAccessReport.
+ void emitAnalysis(const LoopAccessReport &Message) const {
+ emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
+ }
+
+public:
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction*,uint64_t> MinBWs;
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
+ /// Loop Info analysis.
+ LoopInfo *LI;
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+ /// Assumption cache.
+ AssumptionCache *AC;
+ const Function *TheFunction;
+ /// Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+};
+
+/// \brief This holds vectorization requirements that must be verified late in
+/// the process. The requirements are set by the legality analysis and the cost model. Once
+/// vectorization has been determined to be possible and profitable the
+/// requirements can be verified by looking for metadata or compiler options.
+/// For example, some loops require FP commutativity which is only allowed if
+/// vectorization is explicitly specified or if the fast-math compiler option
+/// has been provided.
+/// Late evaluation of these requirements allows helpful diagnostics to be
+/// composed that tell the user what needs to be done to vectorize the loop,
+/// for example by specifying #pragma clang loop vectorize or -ffast-math.
+/// Late evaluation should be used only when diagnostics can be generated that
+/// can be followed by a non-expert user.
+class LoopVectorizationRequirements {
+public:
+ LoopVectorizationRequirements()
+ : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}
+
+ void addUnsafeAlgebraInst(Instruction *I) {
+ // First unsafe algebra instruction.
+ if (!UnsafeAlgebraInst)
+ UnsafeAlgebraInst = I;
+ }
+
+ void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
+
+ bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *Name = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ emitOptimizationRemarkAnalysisFPCommute(
+ F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
+ VectorizationReport() << "cannot prove it is safe to reorder "
+ "floating-point operations");
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ emitOptimizationRemarkAnalysisAliasing(
+ F->getContext(), Name, *F, L->getStartLoc(),
+ VectorizationReport()
+ << "cannot prove it is safe to reorder memory operations");
+ DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+ }
+
+private:
+ unsigned NumRuntimePointerChecks;
+ Instruction *UnsafeAlgebraInst;
+};
+
+static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+ if (L.empty())
+ return V.push_back(&L);
+
+ for (Loop *InnerL : L)
+ addInnerLoop(*InnerL, V);
+}
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public FunctionPass {
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+ : FunctionPass(ID),
+ DisableUnrolling(NoUnrolling),
+ AlwaysVectorize(AlwaysVectorize) {
+ initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ TargetTransformInfo *TTI;
+ DominatorTree *DT;
+ BlockFrequencyInfo *BFI;
+ TargetLibraryInfo *TLI;
+ DemandedBits *DB;
+ AliasAnalysis *AA;
+ AssumptionCache *AC;
+ LoopAccessAnalysis *LAA;
+ bool DisableUnrolling;
+ bool AlwaysVectorize;
+
+ BlockFrequency ColdEntryFreq;
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ LAA = &getAnalysis<LoopAccessAnalysis>();
+ DB = &getAnalysis<DemandedBits>();
+
+ // Compute some weights outside of the loop over the loops. Compute this
+ // using a BranchProbability to re-use its scaling math.
+ const BranchProbability ColdProb(1, 5); // 20%
+ ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
+
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
+ return false;
+
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ addInnerLoop(*L, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ while (!Worklist.empty())
+ Changed |= processLoop(Worklist.pop_back_val());
+
+ // Process each loop nest in the function.
+ return Changed;
+ }
+
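+ // Attach loop metadata telling the unroller not to runtime-unroll the scalar
+ // remainder loop, unless unroll-disable metadata is already present. The
+ // resulting LoopID is roughly of the form !0 = !{!0, ..., !1} with
+ // !1 = !{!"llvm.loop.unroll.runtime.disable"}.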
+ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+ }
+
+ bool processLoop(Loop *L) {
+ assert(L->empty() && "Only process inner loops.");
+
+#ifndef NDEBUG
+ const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+ DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
+
+ LoopVectorizeHints Hints(L, DisableUnrolling);
+
+ DEBUG(dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?")) << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
+
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+ // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+ // less verbose reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
+ if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
+ DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ return false;
+ }
+
+ // Check the loop for a trip count threshold:
+ // do not vectorize loops with a tiny trip count.
+ const unsigned TC = SE->getSmallConstantTripCount(L);
+ if (TC > 0u && TC < TinyTripCountVectorThreshold) {
+ DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is not worth vectorizing.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ DEBUG(dbgs() << "\n");
+ emitAnalysisDiag(F, L, Hints, VectorizationReport()
+ << "vectorization is not beneficial "
+ "and is not explicitly forced");
+ return false;
+ }
+ }
+
+ PredicatedScalarEvolution PSE(*SE);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements;
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+ &Requirements, &Hints);
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
+ &Hints);
+ CM.collectValuesToIgnore();
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ F->optForSize();
+
+ // Compute the weighted frequency of this loop being executed and see if it
+ // is less than 20% of the function entry baseline frequency. Note that we
+ // always have a canonical loop here because we think we *can* vectorize.
+ // FIXME: This is hidden behind a flag due to pervasive problems with
+ // exactly what block frequency models.
+ if (LoopVectorizeWithBlockFrequency) {
+ BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ LoopEntryFreq < ColdEntryFreq)
+ OptForSize = true;
+ }
+
+ // Check the function attributes to see if implicit floats are allowed.
+ // FIXME: This check doesn't seem possibly correct -- what if the loop is
+ // an integer loop and the vector instructions selected are purely integer
+ // vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ " attribute is used.\n");
+ emitAnalysisDiag(
+ F, L, Hints,
+ VectorizationReport()
+ << "loop not vectorized due to NoImplicitFloat attribute");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ // Select the optimal vectorization factor.
+ const LoopVectorizationCostModel::VectorizationFactor VF =
+ CM.selectVectorizationFactor(OptForSize);
+
+ // Select the interleave count.
+ unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
+
+ // Get user interleave count.
+ unsigned UserIC = Hints.getInterleave();
+
+ // Identify the diagnostic messages that should be produced.
+ std::string VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
+ if (VF.Width == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg =
+ "the cost-model indicates that vectorization is not beneficial";
+ VectorizeLoop = false;
+ }
+
+ if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg =
+ "the cost-model indicates that interleaving is not beneficial";
+ InterleaveLoop = false;
+ if (UserIC == 1)
+ IntDiagMsg +=
+ " and is explicitly disabled or interleave count is set to 1";
+ } else if (IC > 1 && UserIC == 1) {
+ // Tell the user interleaving is beneficial but is explicitly disabled.
+ DEBUG(dbgs()
+ << "LV: Interleaving is beneficial but is explicitly disabled.");
+ IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1";
+ InterleaveLoop = false;
+ }
+
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+ // Do not vectorize or interleave the loop.
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
+ L->getStartLoc(), VecDiagMsg);
+ emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
+ L->getStartLoc(), IntDiagMsg);
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
+ L->getStartLoc(), VecDiagMsg);
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
+ L->getStartLoc(), IntDiagMsg);
+ } else if (VectorizeLoop && InterleaveLoop) {
+ DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
+ << DebugLocStr << '\n');
+ DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not legal to vectorize the loop then
+ // interleave it.
+ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
+ Unroller.vectorize(&LVL, CM.MinBWs);
+
+ emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ Twine("interleaved loop (interleaved count: ") +
+ Twine(IC) + ")");
+ } else {
+ // If we decided that it is *legal* to vectorize the loop then do it.
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
+ LB.vectorize(&LVL, CM.MinBWs);
+ ++LoopsVectorized;
+
+ // Add metadata to disable runtime unrolling of the scalar loop when there
+ // are no runtime checks for strides and memory. In that situation the
+ // scalar loop is rarely executed and is not worth unrolling.
+ if (!LB.IsSafetyChecksAdded())
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Report the vectorization decision.
+ emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
+ Twine("vectorized loop (vectorization width: ") +
+ Twine(VF.Width) + ", interleaved count: " +
+ Twine(IC) + ")");
+ }
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LoopAccessAnalysis>();
+ AU.addRequired<DemandedBits>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
+//===----------------------------------------------------------------------===//
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // We need to place the broadcast of invariant variables outside the loop.
+ Instruction *Instr = dyn_cast<Instruction>(V);
+ bool NewInstr =
+ (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
+ Instr->getParent()) != LoopVectorBody.end());
+ bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
+
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (Invariant)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+}
+
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
+ Value *Step) {
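+ // For illustration, with StartIdx = 0 and a scalar step S, lane i of the
+ // result is Val[i] + i * S.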
+ assert(Val->getType()->isVectorTy() && "Must be a vector");
+ assert(Val->getType()->getScalarType()->isIntegerTy() &&
+ "Elem must be an integer");
+ assert(Step->getType() == Val->getType()->getScalarType() &&
+ "Step has wrong type");
+ // Create the types.
+ Type *ITy = Val->getType()->getScalarType();
+ VectorType *Ty = cast<VectorType>(Val->getType());
+ int VLen = Ty->getNumElements();
+ SmallVector<Constant*, 8> Indices;
+
+ // Create a vector of consecutive numbers from zero to VF.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ Step = Builder.CreateMul(Cv, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+}
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
+ auto *SE = PSE.getSE();
+ // Make sure that the pointer does not point to structs.
+ if (Ptr->getType()->getPointerElementType()->isAggregateType())
+ return 0;
+
+ // If this value is a pointer induction variable we know it is consecutive.
+ PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+ if (Phi && Inductions.count(Phi)) {
+ InductionDescriptor II = Inductions[Phi];
+ return II.getConsecutiveDirection();
+ }
+
+ GetElementPtrInst *Gep = getGEPInstruction(Ptr);
+ if (!Gep)
+ return 0;
+
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *GpPtr = Gep->getPointerOperand();
+ // If this GEP value is a consecutive pointer induction variable and all of
+ // the indices are constant then we know it is consecutive.
+ Phi = dyn_cast<PHINode>(GpPtr);
+ if (Phi && Inductions.count(Phi)) {
+
+ // Make sure that the pointer does not point to structs.
+ PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
+ if (GepPtrType->getElementType()->isAggregateType())
+ return 0;
+
+ // Make sure that all of the index operands are loop invariant.
+ for (unsigned i = 1; i < NumOperands; ++i)
+ if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
+ return 0;
+
+ InductionDescriptor II = Inductions[Phi];
+ return II.getConsecutiveDirection();
+ }
+
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
+
+ // Check that all of the gep indices are uniform except for our induction
+ // operand.
+ for (unsigned i = 0; i != NumOperands; ++i)
+ if (i != InductionOperand &&
+ !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
+ return 0;
+
+ // We can emit wide load/stores only if the last non-zero index is the
+ // induction variable.
+ const SCEV *Last = nullptr;
+ if (!Strides.count(Gep))
+ Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
+ else {
+ // Because of the multiplication by a stride we can have a s/zext cast.
+ // We are going to replace this stride by 1 so the cast is safe to ignore.
+ //
+ // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ // %0 = trunc i64 %indvars.iv to i32
+ // %mul = mul i32 %0, %Stride1
+ // %idxprom = zext i32 %mul to i64 << Safe cast.
+ // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
+ //
+ Last = replaceSymbolicStrideSCEV(PSE, Strides,
+ Gep->getOperand(InductionOperand), Gep);
+ if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
+ Last =
+ (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
+ ? C->getOperand()
+ : Last;
+ }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+
+ // The memory is consecutive because the last index is consecutive
+ // and all other indices are loop invariant.
+ if (Step->isOne())
+ return 1;
+ if (Step->isAllOnesValue())
+ return -1;
+ }
+
+ return 0;
+}
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+InnerLoopVectorizer::VectorParts&
+InnerLoopVectorizer::getVectorValue(Value *V) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+
+ // If we have a stride that is replaced by one, do it here.
+ if (Legal->hasStride(V))
+ V = ConstantInt::get(V->getType(), 1);
+
+ // If we have this scalar in the map, return it.
+ if (WidenMap.has(V))
+ return WidenMap.get(V);
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
+ Value *B = getBroadcastInstrs(V);
+ return WidenMap.splat(V, B);
+}
+
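+// Reverse the order of the lanes in \p Vec; e.g. for VF = 4 the shuffle mask
+// is <3, 2, 1, 0>.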
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
+ SmallVector<Constant*, 8> ShuffleMask;
+ for (unsigned i = 0; i < VF; ++i)
+ ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+ return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "reverse");
+}
+
+// Get a mask to interleave \p NumVec vectors into a wide vector.
+// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
+// E.g. For 2 interleaved vectors, if VF is 4, the mask is:
+// <0, 4, 1, 5, 2, 6, 3, 7>
+static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
+ unsigned NumVec) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ for (unsigned j = 0; j < NumVec; j++)
+ Mask.push_back(Builder.getInt32(j * VF + i));
+
+ return ConstantVector::get(Mask);
+}
+
+// Get the strided mask starting from index \p Start.
+// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
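+// E.g. For Start = 1, Stride = 3 and VF = 4, the mask is:
+// <1, 4, 7, 10>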
+static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned Stride, unsigned VF) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ Mask.push_back(Builder.getInt32(Start + i * Stride));
+
+ return ConstantVector::get(Mask);
+}
+
+// Get a mask of two parts: the first part consists of sequential integers
+// starting from 0, and the second part consists of UNDEFs.
+// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
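+// E.g. For NumInt = 4 and NumUndef = 4, the mask is:
+// <0, 1, 2, 3, undef, undef, undef, undef>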
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
+ unsigned NumUndef) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumInt; i++)
+ Mask.push_back(Builder.getInt32(i));
+
+ Constant *Undef = UndefValue::get(Builder.getInt32Ty());
+ for (unsigned i = 0; i < NumUndef; i++)
+ Mask.push_back(Undef);
+
+ return ConstantVector::get(Mask);
+}
+
+// Concatenate two vectors with the same element type. The 2nd vector should
+// not have more elements than the 1st vector. If the 2nd vector has fewer
+// elements, extend it with UNDEFs.
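+// E.g. Concatenating a <4 x i32> V1 and a <2 x i32> V2 first widens V2 to
+// <V2_0, V2_1, undef, undef> and then produces the <6 x i32> result
+// <V1_0, V1_1, V1_2, V1_3, V2_0, V2_1>.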
+static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
+ Value *V2) {
+ VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
+ VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
+ assert(VecTy1 && VecTy2 &&
+ VecTy1->getScalarType() == VecTy2->getScalarType() &&
+ "Expect two vectors with the same element type");
+
+ unsigned NumElts1 = VecTy1->getNumElements();
+ unsigned NumElts2 = VecTy2->getNumElements();
+ assert(NumElts1 >= NumElts2 && "Unexpected: the first vector has fewer elements");
+
+ if (NumElts1 > NumElts2) {
+ // Extend with UNDEFs.
+ Constant *ExtMask =
+ getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
+ V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
+ }
+
+ Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
+ return Builder.CreateShuffleVector(V1, V2, Mask);
+}
+
+// Concatenate vectors in the given list. All vectors have the same type.
+static Value *ConcatenateVectors(IRBuilder<> &Builder,
+ ArrayRef<Value *> InputList) {
+ unsigned NumVec = InputList.size();
+ assert(NumVec > 1 && "Should be at least two vectors");
+
+ SmallVector<Value *, 8> ResList;
+ ResList.append(InputList.begin(), InputList.end());
+ do {
+ SmallVector<Value *, 8> TmpList;
+ for (unsigned i = 0; i < NumVec - 1; i += 2) {
+ Value *V0 = ResList[i], *V1 = ResList[i + 1];
+ assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
+ "Only the last vector may have a different type");
+
+ TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
+ }
+
+ // Push the last vector if the total number of vectors is odd.
+ if (NumVec % 2 != 0)
+ TmpList.push_back(ResList[NumVec - 1]);
+
+ ResList = TmpList;
+ NumVec = ResList.size();
+ } while (NumVec > 1);
+
+ return ResList[0];
+}
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// R = Pic[i]; // Member of index 0
+// G = Pic[i+1]; // Member of index 1
+// B = Pic[i+2]; // Member of index 2
+// ... // do something to R, G, B
+// }
+// To:
+// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
+// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
+// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
+// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
+//
+// Or translate following interleaved store group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// ... do something to R, G, B
+// Pic[i] = R; // Member of index 0
+// Pic[i+1] = G; // Member of index 1
+// Pic[i+2] = B; // Member of index 2
+// }
+// To:
+// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
+// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
+// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
+ const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Skip if current instruction is not the insert position.
+ if (Instr != Group->getInsertPos())
+ return;
+
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
+ Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+
+ // Prepare for the new pointers.
+ setDebugLocFromInst(Builder, Ptr);
+ VectorParts &PtrParts = getVectorValue(Ptr);
+ SmallVector<Value *, 2> NewPtrs;
+ unsigned Index = Group->getIndex(Instr);
+ for (unsigned Part = 0; Part < UF; Part++) {
+ // Extract the pointer for the current instruction from the pointer vector.
+ // A reverse access uses the pointer in the last lane.
+ Value *NewPtr = Builder.CreateExtractElement(
+ PtrParts[Part],
+ Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+
+ // Notice that the current instruction could have any member index, so the
+ // address needs to be adjusted to the member of index 0.
+ //
+ // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
+ // b = A[i]; // Member of index 0
+ // The current pointer points to A[i+1]; adjust it to A[i].
+ //
+ // E.g. A[i+1] = a; // Member of index 1
+ // A[i] = b; // Member of index 0
+ // A[i+2] = c; // Member of index 2 (Current instruction)
+ // The current pointer points to A[i+2]; adjust it to A[i].
+ NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+
+ // Cast to the vector pointer type.
+ NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
+ }
+
+ setDebugLocFromInst(Builder, Instr);
+ Value *UndefVec = UndefValue::get(VecTy);
+
+ // Vectorize the interleaved load group.
+ if (LI) {
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
+ NewPtrs[Part], Group->getAlignment(), "wide.vec");
+
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ Instruction *Member = Group->getMember(i);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
+ Value *StridedVec = Builder.CreateShuffleVector(
+ NewLoadInstr, UndefVec, StrideMask, "strided.vec");
+
+ // If this member has a different type, cast the result to that type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+ }
+
+ VectorParts &Entry = WidenMap.get(Member);
+ Entry[Part] =
+ Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
+ }
+
+ propagateMetadata(NewLoadInstr, Instr);
+ }
+ return;
+ }
+
+ // The sub vector type for current instruction.
+ VectorType *SubVT = VectorType::get(ScalarTy, VF);
+
+ // Vectorize the interleaved store group.
+ for (unsigned Part = 0; Part < UF; Part++) {
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ // An interleaved store group doesn't allow a gap, so each index has a member.
+ Instruction *Member = Group->getMember(i);
+ assert(Member && "Fail to get a member from an interleaved store group");
+
+ Value *StoredVec =
+ getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
+ if (Group->isReverse())
+ StoredVec = reverseVector(StoredVec);
+
+ // If this member has a different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Concatenate all vectors into a wide vector.
+ Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
+
+ // Interleave the elements in the wide vector.
+ Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
+ Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
+ "interleaved.vec");
+
+ Instruction *NewStoreInstr =
+ Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+ propagateMetadata(NewStoreInstr, Instr);
+ }
+}
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+
+ // Try to vectorize the interleave group if this access is interleaved.
+ if (Legal->isAccessInterleaved(Instr))
+ return vectorizeInterleaveGroup(Instr);
+
+ Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *DataTy = VectorType::get(ScalarDataTy, VF);
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ // An alignment of 0 means target ABI alignment. We need to use the scalar's
+ // target ABI alignment in such a case.
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(ScalarDataTy);
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
+ unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
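+ // For illustration, for a <4 x i32> access both ScalarAllocatedSize and
+ // VectorElementSize are 4 bytes; if the two sizes differ, the access is
+ // scalarized below.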
+
+ if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
+ !Legal->isMaskRequired(SI))
+ return scalarizeInstruction(Instr, true);
+
+ if (ScalarAllocatedSize != VectorElementSize)
+ return scalarizeInstruction(Instr);
+
+ // If the pointer is loop invariant or if it is non-consecutive,
+ // scalarize the load.
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
+ bool UniformLoad = LI && Legal->isUniform(Ptr);
+ if (!ConsecutiveStride || UniformLoad)
+ return scalarizeInstruction(Instr);
+
+ Constant *Zero = Builder.getInt32(0);
+ VectorParts &Entry = WidenMap.get(Instr);
+
+ // Handle consecutive loads/stores.
+ GetElementPtrInst *Gep = getGEPInstruction(Ptr);
+ if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+ setDebugLocFromInst(Builder, Gep);
+ Value *PtrOperand = Gep->getPointerOperand();
+ Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
+ FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(0, FirstBasePtr);
+ Gep2->setName("gep.indvar.base");
+ Ptr = Builder.Insert(Gep2);
+ } else if (Gep) {
+ setDebugLocFromInst(Builder, Gep);
+ assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
+ OrigLoop) &&
+ "Base ptr must be invariant");
+
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *GepOperand = Gep->getOperand(i);
+ Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
+
+ // Update last index or loop invariant instruction anchored in loop.
+ if (i == InductionOperand ||
+ (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
+ assert((i == InductionOperand ||
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
+ OrigLoop)) &&
+ "Must be last index or loop invariant");
+
+ VectorParts &GEPParts = getVectorValue(GepOperand);
+ Value *Index = GEPParts[0];
+ Index = Builder.CreateExtractElement(Index, Zero);
+ Gep2->setOperand(i, Index);
+ Gep2->setName("gep.indvar.idx");
+ }
+ }
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ setDebugLocFromInst(Builder, Ptr);
+ VectorParts &PtrVal = getVectorValue(Ptr);
+ Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+ }
+
+ VectorParts Mask = createBlockInMask(Instr->getParent());
+ // Handle Stores:
+ if (SI) {
+ assert(!Legal->isUniform(SI->getPointerOperand()) &&
+ "We do not allow storing to uniform addresses");
+ setDebugLocFromInst(Builder, SI);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't use a reference type for "StoredVal".
+ VectorParts StoredVal = getVectorValue(SI->getValueOperand());
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal[Part] = reverseVector(StoredVal[Part]);
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
+ Mask[Part] = reverseVector(Mask[Part]);
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr,
+ DataTy->getPointerTo(AddressSpace));
+
+ Instruction *NewSI;
+ if (Legal->isMaskRequired(SI))
+ NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
+ Mask[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+ propagateMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ setDebugLocFromInst(Builder, LI);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr =
+ Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide load needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
+ Mask[Part] = reverseVector(Mask[Part]);
+ }
+
+ Instruction* NewLI;
+ Value *VecPtr = Builder.CreateBitCast(PartPtr,
+ DataTy->getPointerTo(AddressSpace));
+ if (Legal->isMaskRequired(LI))
+ NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
+ UndefValue::get(DataTy),
+ "wide.masked.load");
+ else
+ NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+ propagateMetadata(NewLI, LI);
+ Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
+ }
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ // Holds vector parameters or scalars, in case of uniform vals.
+ SmallVector<VectorParts, 4> Params;
+
+ setDebugLocFromInst(Builder, Instr);
+
+ // Find all of the vectorized parameters.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *SrcOp = Instr->getOperand(op);
+
+ // If we are accessing the old induction variable, use the new one.
+ if (SrcOp == OldInduction) {
+ Params.push_back(getVectorValue(SrcOp));
+ continue;
+ }
+
+ // Try using previously calculated values.
+ Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+ // If the src is an instruction that appeared earlier in the basic block,
+ // then it should already be vectorized.
+ if (SrcInst && OrigLoop->contains(SrcInst)) {
+ assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+ // The parameter is a vector value from earlier.
+ Params.push_back(WidenMap.get(SrcInst));
+ } else {
+ // The parameter is a scalar from outside the loop. Maybe even a constant.
+ VectorParts Scalars;
+ Scalars.append(UF, SrcOp);
+ Params.push_back(Scalars);
+ }
+ }
+
+ assert(Params.size() == Instr->getNumOperands() &&
+ "Invalid number of operands");
+
+ // Does this instruction return a value ?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ Value *UndefVec = IsVoidRetTy ? nullptr :
+ UndefValue::get(VectorType::get(Instr->getType(), VF));
+ // Create a new entry in the WidenMap and initialize it to Undef or Null.
+ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+ VectorParts Cond;
+ if (IfPredicateStore) {
+ assert(Instr->getParent()->getSinglePredecessor() &&
+ "Only support single predecessor blocks");
+ Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+ Instr->getParent());
+ }
+
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each scalar that we create:
+ for (unsigned Width = 0; Width < VF; ++Width) {
+
+ // Start if-block.
+ Value *Cmp = nullptr;
+ if (IfPredicateStore) {
+ Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
+ ConstantInt::get(Cmp->getType(), 1));
+ }
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instructions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op][Part];
+ // Param is a vector. Need to extract the right lane.
+ if (Op->getType()->isVectorTy())
+ Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
+ Cloned->setOperand(op, Op);
+ }
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // If the original scalar returns a value we need to place it in a vector
+ // so that future users will be able to use it.
+ if (!IsVoidRetTy)
+ VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
+ Builder.getInt32(Width));
+ // End if-block.
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+ Cmp));
+ }
+ }
+}
+
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+ Value *End, Value *Step,
+ Instruction *DL) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ // As we're just creating this loop, it's possible no latch exists
+ // yet. If so, use the header as this will be a single block loop.
+ if (!Latch)
+ Latch = Header;
+
+ IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
+ auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+
+ Builder.SetInsertPoint(Latch->getTerminator());
+
+ // Create i+1 and fill the PHINode.
+ Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Start, L->getLoopPreheader());
+ Induction->addIncoming(Next, Latch);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(Next, End);
+ Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
+
+ // Now we have two terminators. Remove the old one from the block.
+ Latch->getTerminator()->eraseFromParent();
+
+ return Induction;
+}
+
+Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+ if (TripCount)
+ return TripCount;
+
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ // Find the loop boundaries.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
+ assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+ "Invalid loop count");
+
+ Type *IdxTy = Legal->getWidestInductionType();
+
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way that we get a backedge taken count is that the
+ // induction variable was signed and as such will not overflow. In such a case
+ // truncation is legal.
+ if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, DL, "induction");
+
+ // Count holds the overall loop count (N).
+ TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ L->getLoopPreheader()->getTerminator());
+
+ if (TripCount->getType()->isPointerTy())
+ TripCount =
+ CastInst::CreatePointerCast(TripCount, IdxTy,
+ "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
+
+ return TripCount;
+}
+
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getOrCreateTripCount(L);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+
+ // Now we need to generate the expression for N - (N % VF), which is
+ // the part that the vectorized body will execute.
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
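+ // For illustration, if TC = 103, VF = 8 and UF = 2, then Step = 16,
+ // n.mod.vf = 103 % 16 = 7 and n.vec = 96.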
+ Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
+
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+ // Generate code to check that the loop's trip count that we computed by
+ // adding one to the backedge-taken count will not overflow.
+ Value *CheckMinIters =
+ Builder.CreateICmpULT(Count,
+ ConstantInt::get(Count->getType(), VF * UF),
+ "min.iters.check");
+
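+ // For illustration, with VF = 8 and UF = 2 the check above bypasses the
+ // vector loop whenever the trip count is less than 16.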
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
+ "min.iters.checked");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, CheckMinIters));
+ LoopBypassBlocks.push_back(BB);
+}
+
+void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *TC = getOrCreateVectorTripCount(L);
+ BasicBlock *BB = L->getLoopPreheader();
+ IRBuilder<> Builder(BB->getTerminator());
+
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
+ Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
+ "cmp.zero");
+
+ // Generate code to check that the loop's trip count that we computed by
+ // adding one to the backedge-taken count will not overflow.
+ BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
+ "vector.ph");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, Cmp));
+ LoopBypassBlocks.push_back(BB);
+}
+
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+ // Generate the code to check the SCEV assumptions that we made.
+ // We want the new basic block to start at the first instruction in a
+ // sequence of instructions that form a check.
+ SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
+ "scev.check");
+ Value *SCEVCheck =
+ Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (C->isZero())
+ return;
+
+ // Create a new block containing the stride check.
+ BB->setName("vector.scevcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, SCEVCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+}
+
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
+ BasicBlock *Bypass) {
+ BasicBlock *BB = L->getLoopPreheader();
+
+ // Generate the code that checks at runtime whether arrays overlap. We put
+ // the checks into a separate block to make the more common case of few elements
+ // faster.
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+ if (!MemRuntimeCheck)
+ return;
+
+ // Create a new block containing the memory check.
+ BB->setName("vector.memcheck");
+ auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
+ if (L->getParentLoop())
+ L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
+ ReplaceInstWithInst(BB->getTerminator(),
+ BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
+ LoopBypassBlocks.push_back(BB);
+ AddedSafetyChecks = true;
+}
+
+
+void InnerLoopVectorizer::createEmptyLoop() {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- loop iteration number check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ |/ |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ | v
+ | -[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+ BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
+ BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ assert(VectorPH && "Invalid loop structure");
+ assert(ExitBlock && "Must have an exit block");
+
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is C++ iterators, which often have multiple pointer
+ // induction variables. In the code below we also support a case where we
+ // don't have a single induction variable.
+ //
+ // We try to obtain an induction variable from the original loop as hard
+ // as possible. However if we don't find one that:
+ // - is an integer
+ // - counts from zero, stepping by one
+ // - is the size of the widest induction variable type
+ // then we create a new one.
+ OldInduction = Legal->getInduction();
+ Type *IdxTy = Legal->getWidestInductionType();
+
+ // Split the single block loop into the two loop structure described above.
+ BasicBlock *VecBody =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ BasicBlock *MiddleBlock =
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+ BasicBlock *ScalarPH =
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+ // Create and register the new vector loop.
+ Loop* Lp = new Loop();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
+ ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
+ ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
+ } else {
+ LI->addTopLevelLoop(Lp);
+ }
+ Lp->addBasicBlockToLoop(VecBody, *LI);
+
+ // Find the loop boundaries.
+ Value *Count = getOrCreateTripCount(Lp);
+
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+
+ // We need to test whether the backedge-taken count is uint##_max. Adding one
+ // to it will cause overflow and an incorrect loop trip count in the vector
+ // body. In case of overflow we want to directly jump to the scalar remainder
+ // loop.
+ emitMinimumIterationCountCheck(Lp, ScalarPH);
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
+ emitVectorLoopEnteredCheck(Lp, ScalarPH);
+ // Generate the code to check any assumptions that we've made for SCEV
+ // expressions.
+ emitSCEVChecks(Lp, ScalarPH);
+
+ // Generate the code that checks at runtime whether arrays overlap. We put
+ // the checks into a separate block to make the more common case of few
+ // elements faster.
+ emitMemRuntimeChecks(Lp, ScalarPH);
+
+ // Generate the induction variable.
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+ Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ Induction =
+ createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+ getDebugLocFromInstOrOperands(OldInduction));
+
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+
+ // This variable saves the new starting index for the scalar loop. It is used
+ // to test if there are any tail iterations left once the vector loop has
+ // completed.
+ LoopVectorizationLegality::InductionList::iterator I, E;
+ LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ for (I = List->begin(), E = List->end(); I != E; ++I) {
+ PHINode *OrigPhi = I->first;
+ InductionDescriptor II = I->second;
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
+ "bc.resume.val",
+ ScalarPH->getTerminator());
+ Value *EndValue;
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
+ EndValue = CountRoundDown;
+ } else {
+ IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
+ Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
+ II.getStepValue()->getType(),
+ "cast.crd");
+ EndValue = II.transform(B, CRD);
+ EndValue->setName("ind.end");
+ }
+
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, MiddleBlock);
+
+ // Fix the scalar body counter (PHI node).
+ unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
+ OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+ }
+
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n",
+ MiddleBlock->getTerminator());
+ ReplaceInstWithInst(MiddleBlock->getTerminator(),
+ BranchInst::Create(ExitBlock, ScalarPH, CmpN));
+
+ // Get ready to start creating new instructions into the vectorized body.
+ Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
+
+ // Save the state.
+ LoopVectorPreHeader = Lp->getLoopPreheader();
+ LoopScalarPreHeader = ScalarPH;
+ LoopMiddleBlock = MiddleBlock;
+ LoopExitBlock = ExitBlock;
+ LoopVectorBody.push_back(VecBody);
+ LoopScalarBody = OldBasicBlock;
+
+ LoopVectorizeHints Hints(Lp, true);
+ Hints.setAlreadyVectorized();
+}
+
+namespace {
+struct CSEDenseMapInfo {
+ static bool canHandle(Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+ static bool isEqual(Instruction *LHS, Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+}
+
+/// \brief Check whether this block is a predicated block.
+/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
+/// = ...; " blocks. We start with one vectorized basic block. For every
+/// conditional block we split this vectorized block. Therefore, every second
+/// block will be a predicated one.
+static bool isPredicatedBlock(unsigned BlockNum) {
+ return BlockNum % 2;
+}
+
+/// \brief Perform CSE of induction variable instructions.
+static void cse(SmallVector<BasicBlock *, 4> &BBs) {
+ // Perform simple cse.
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
+ BasicBlock *BB = BBs[i];
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = &*I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+ // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
+ // ...;" blocks for predicated stores. Every second block is a predicated
+ // block.
+ if (isPredicatedBlock(i))
+ continue;
+
+ CSEMap[In] = In;
+ }
+ }
+}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)){
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
+/// the result needs to be inserted and/or extracted from vectors.
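+/// For example, for a <4 x i32> type with both Insert and Extract set, the
+/// overhead is the summed cost of four insertelement and four extractelement
+/// operations.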
+static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+ const TargetTransformInfo &TTI) {
+ if (Ty->isVoidTy())
+ return 0;
+
+ assert(Ty->isVectorTy() && "Can only scalarize vectors");
+ unsigned Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
+// Estimate cost of a call instruction CI if it were vectorized with factor VF.
+// Return the cost of the instruction, including scalarization overhead if it's
+// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
+// i.e., either a vector version isn't available or it is too expensive.
+static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI,
+ bool &NeedToScalarize) {
+ Function *F = CI->getCalledFunction();
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
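+  // Roughly, Cost(scalarized) = VF * ScalarCallCost
+  //                           + cost of extracting the vector arguments
+  //                           + cost of inserting the scalar results into the
+  //                             vector return value.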
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ if (VF == 1)
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(ScalarTys[i], VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ unsigned ScalarizationCost =
+ getScalarizationOverhead(RetTy, true, false, TTI);
+ for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
+ ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
+
+ unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
+ if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
+ return VectorCallCost;
+ }
+ return Cost;
+}
+
+// Estimate cost of an intrinsic call instruction CI if it were vectorized with
+// factor VF. Return the cost of the instruction, including scalarization
+// overhead if it's needed.
+static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+}
+
+static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
+ IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
+ IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
+}
+static Type *largestIntegerVectorType(Type *T1, Type *T2) {
+ IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
+ IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
+ return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
+}
+
+void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+ // For every instruction `I` in MinBWs, truncate the operands, create a
+ // truncated version of `I` and reextend its result. InstCombine runs
+ // later and will remove any ext/trunc pairs.
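+  // For example, if MinBWs records that a <VF x i32> add only needs 8 bits,
+  // its operands are shrunk to <VF x i8> (reusing an existing zext source
+  // where possible), the add is recreated on <VF x i8>, and the result is
+  // zero-extended back to <VF x i32>.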
+ //
+ for (auto &KV : MinBWs) {
+ VectorParts &Parts = WidenMap.get(KV.first);
+ for (Value *&I : Parts) {
+ if (I->use_empty())
+ continue;
+ Type *OriginalTy = I->getType();
+ Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
+ KV.second);
+ Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
+ OriginalTy->getVectorNumElements());
+ if (TruncatedTy == OriginalTy)
+ continue;
+
+ IRBuilder<> B(cast<Instruction>(I));
+ auto ShrinkOperand = [&](Value *V) -> Value* {
+ if (auto *ZI = dyn_cast<ZExtInst>(V))
+ if (ZI->getSrcTy() == TruncatedTy)
+ return ZI->getOperand(0);
+ return B.CreateZExtOrTrunc(V, TruncatedTy);
+ };
+
+ // The actual instruction modification depends on the instruction type,
+ // unfortunately.
+ Value *NewI = nullptr;
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(),
+ ShrinkOperand(BO->getOperand(0)),
+ ShrinkOperand(BO->getOperand(1)));
+ cast<BinaryOperator>(NewI)->copyIRFlags(I);
+ } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
+ NewI = B.CreateICmp(CI->getPredicate(),
+ ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ NewI = B.CreateSelect(SI->getCondition(),
+ ShrinkOperand(SI->getTrueValue()),
+ ShrinkOperand(SI->getFalseValue()));
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ switch (CI->getOpcode()) {
+ default: llvm_unreachable("Unhandled cast!");
+ case Instruction::Trunc:
+ NewI = ShrinkOperand(CI->getOperand(0));
+ break;
+ case Instruction::SExt:
+ NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy,
+ TruncatedTy));
+ break;
+ case Instruction::ZExt:
+ NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy,
+ TruncatedTy));
+ break;
+ }
+ } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
+ auto *O0 =
+ B.CreateZExtOrTrunc(SI->getOperand(0),
+ VectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
+ auto *O1 =
+ B.CreateZExtOrTrunc(SI->getOperand(1),
+ VectorType::get(ScalarTruncatedTy, Elements1));
+
+ NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
+ } else if (isa<LoadInst>(I)) {
+ // Don't do anything with the operands, just extend the result.
+ continue;
+ } else {
+ llvm_unreachable("Unhandled instruction type!");
+ }
+
+ // Lastly, extend the result.
+ NewI->takeName(cast<Instruction>(I));
+ Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
+ I->replaceAllUsesWith(Res);
+ cast<Instruction>(I)->eraseFromParent();
+ I = Res;
+ }
+ }
+
+  // We'll have created a bunch of ZExts that are now dead. Clean them up.
+ for (auto &KV : MinBWs) {
+ VectorParts &Parts = WidenMap.get(KV.first);
+ for (Value *&I : Parts) {
+ ZExtInst *Inst = dyn_cast<ZExtInst>(I);
+ if (Inst && Inst->use_empty()) {
+ Value *NewI = Inst->getOperand(0);
+ Inst->eraseFromParent();
+ I = NewI;
+ }
+ }
+ }
+}
+
+void InnerLoopVectorizer::vectorizeLoop() {
+ //===------------------------------------------------===//
+ //
+  // Notice: any optimization or new instruction that goes
+  // into the code below should also be implemented in
+  // the cost-model.
+ //
+ //===------------------------------------------------===//
+ Constant *Zero = Builder.getInt32(0);
+
+ // In order to support reduction variables we need to be able to vectorize
+ // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
+ // stages. First, we create a new vector PHI node with no incoming edges.
+ // We use this value when we vectorize all of the instructions that use the
+ // PHI. Next, after all of the instructions in the block are complete we
+ // add the new incoming edges to the PHI. At this point all of the
+ // instructions in the basic block are vectorized, so we can use them to
+ // construct the PHI.
+ PhiVector RdxPHIsToFix;
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all of the blocks in the original loop.
+ for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+ be = DFS.endRPO(); bb != be; ++bb)
+ vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
+
+ // Insert truncates and extends for any truncated instructions as hints to
+ // InstCombine.
+ if (VF > 1)
+ truncateToMinimalBitwidths();
+
+ // At this point every instruction in the original loop is widened to
+ // a vector form. We are almost done. Now, we need to fix the PHI nodes
+ // that we vectorized. The PHI nodes are currently empty because we did
+ // not want to introduce cycles. Notice that the remaining PHI nodes
+ // that we need to fix are reduction variables.
+
+ // Create the 'reduced' values for each of the induction vars.
+ // The reduced values are the vector values that we scalarize and combine
+ // after the loop is finished.
+ for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
+ it != e; ++it) {
+ PHINode *RdxPhi = *it;
+ assert(RdxPhi && "Unable to recover vectorized PHI");
+
+ // Find the reduction variable descriptor.
+ assert(Legal->isReductionVariable(RdxPhi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
+
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RdxDesc.getMinMaxRecurrenceKind();
+ setDebugLocFromInst(Builder, ReductionStartValue);
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ VectorParts &VectorExit = getVectorValue(LoopExitInst);
+ Type *VecTy = VectorExit[0]->getType();
+
+    // Find the reduction identity value: zero for addition, or, and xor;
+    // one for multiplication; -1 for and.
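+    // For example, an integer add reduction with start value S and VF = 4
+    // uses Identity = <0, 0, 0, 0> and VectorStart = <S, 0, 0, 0>.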
+ Value *Identity;
+ Value *VectorStart;
+ if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+ RK == RecurrenceDescriptor::RK_FloatMinMax) {
+      // MinMax reductions have the start value as their identity.
+ if (VF == 1) {
+ VectorStart = Identity = ReductionStartValue;
+ } else {
+ VectorStart = Identity =
+ Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+ }
+ } else {
+ // Handle other reduction kinds:
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = ReductionStartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart =
+ Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+ }
+ }
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
+ VectorParts &Val = getVectorValue(LoopVal);
+ for (unsigned part = 0; part < UF; ++part) {
+      // Make sure to add the reduction start value only to the
+      // first unroll part.
+ Value *StartVal = (part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
+ LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
+ LoopVectorBody.back());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ VectorParts RdxParts = getVectorValue(LoopExitInst);
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
+ for (unsigned part = 0; part < UF; ++part) {
+ Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[part]->user_begin();
+ UI != RdxParts[part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+ RdxParts[part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned part = 0; part < UF; ++part)
+ RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ setDebugLocFromInst(Builder, ReducedPartRdx);
+ for (unsigned part = 1; part < UF; ++part) {
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+ ReducedPartRdx, "bin.rdx"));
+ else
+ ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
+ Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
+ }
+
+ if (VF > 1) {
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
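+      // For example, with VF = 4 the first round uses the shuffle mask
+      // <2, 3, undef, undef> and combines lanes {0,2} and {1,3}; the second
+      // round uses <1, undef, undef, undef> and combines the two remaining
+      // lanes, leaving the final value in lane 0.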
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = ReducedPartRdx;
+ SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i/2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf =
+ Builder.CreateShuffleVector(TmpVec,
+ UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
+ else
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
+ TmpVec, Shuf);
+ }
+
+ // The result is in the first element of the vector.
+ ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
+ Builder.getInt32(0));
+
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (RdxPhi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi) break;
+
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found our reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
+ // Add an edge coming from the bypass.
+ LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ break;
+ }
+ }// end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+ }// end of for each redux variable.
+
+ fixLCSSAPHIs();
+
+ // Make sure DomTree is updated.
+ updateAnalysis();
+
+ // Predicate any stores.
+ for (auto KV : PredicatedStores) {
+ BasicBlock::iterator I(KV.first);
+ auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
+ auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DT);
+ I->moveBefore(T);
+ I->getParent()->setName("pred.store.if");
+ BB->setName("pred.store.continue");
+ }
+ DEBUG(DT->verifyDomTree());
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi) break;
+ if (LCSSAPhi->getNumIncomingValues() == 1)
+ LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
+ LoopMiddleBlock);
+ }
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+ assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
+ "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
+ EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
+ if (ECEntryIt != MaskCache.end())
+ return ECEntryIt->second;
+
+ VectorParts SrcMask = createBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (BI->isConditional()) {
+ VectorParts EdgeMask = getVectorValue(BI->getCondition());
+
+ if (BI->getSuccessor(0) != Dst)
+ for (unsigned part = 0; part < UF; ++part)
+ EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
+
+ for (unsigned part = 0; part < UF; ++part)
+ EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+
+ MaskCache[Edge] = EdgeMask;
+ return EdgeMask;
+ }
+
+ MaskCache[Edge] = SrcMask;
+ return SrcMask;
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB) {
+ Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+ return getVectorValue(C);
+ }
+
+  // This is the block mask. We OR together the masks of all incoming edges,
+  // starting from an all-zero mask.
+ Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+ VectorParts BlockMask = getVectorValue(Zero);
+
+ // For each pred:
+ for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
+ VectorParts EM = createEdgeMask(*it, BB);
+ for (unsigned part = 0; part < UF; ++part)
+ BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
+ }
+
+ return BlockMask;
+}
+
+void InnerLoopVectorizer::widenPHIInstruction(
+ Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
+ unsigned VF, PhiVector *PV) {
+ PHINode* P = cast<PHINode>(PN);
+ // Handle reduction variables:
+ if (Legal->isReductionVariable(P)) {
+ for (unsigned part = 0; part < UF; ++part) {
+ // This is phase one of vectorizing PHIs.
+ Type *VecTy = (VF == 1) ? PN->getType() :
+ VectorType::get(PN->getType(), VF);
+ Entry[part] = PHINode::Create(
+ VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
+ }
+ PV->push_back(P);
+ return;
+ }
+
+ setDebugLocFromInst(Builder, P);
+ // Check for PHI nodes that are lowered to vector selects.
+ if (P->getParent() != OrigLoop->getHeader()) {
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = P->getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+ P->getParent());
+ VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+ for (unsigned part = 0; part < UF; ++part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ if (In == 0)
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ In0[part]);
+ else
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ Entry[part], "predphi");
+ }
+ }
+ return;
+ }
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) &&
+ "Not an induction variable");
+
+ InductionDescriptor II = Legal->getInductionVars()->lookup(P);
+
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ switch (II.getKind()) {
+ case InductionDescriptor::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case InductionDescriptor::IK_IntInduction: {
+ assert(P->getType() == II.getStartValue()->getType() &&
+ "Types must match");
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ Value *V = Induction;
+ if (P != OldInduction) {
+ V = Builder.CreateSExtOrTrunc(Induction, P->getType());
+ V = II.transform(Builder, V);
+ V->setName("offset.idx");
+ }
+ Value *Broadcasted = getBroadcastInstrs(V);
+ // After broadcasting the induction variable we need to make the vector
+ // consecutive by adding 0, 1, 2, etc.
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue());
+ return;
+ }
+ case InductionDescriptor::IK_PtrInduction:
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+ // This is the normalized GEP that starts counting at zero.
+ Value *PtrInd = Induction;
+ PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());
+ // This is the vector of results. Notice that we don't generate
+ // vector geps because scalar geps result in better code.
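+    // For example, with VF = 4, unroll part 0 emits four scalar GEPs for
+    // indices PtrInd + 0 .. PtrInd + 3 and packs them into a vector with
+    // insertelement.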
+ for (unsigned part = 0; part < UF; ++part) {
+ if (VF == 1) {
+ int EltIndex = part;
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep = II.transform(Builder, GlobalIdx);
+ SclrGep->setName("next.gep");
+ Entry[part] = SclrGep;
+ continue;
+ }
+
+ Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+ for (unsigned int i = 0; i < VF; ++i) {
+ int EltIndex = i + part * VF;
+ Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
+ Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+ Value *SclrGep = II.transform(Builder, GlobalIdx);
+ SclrGep->setName("next.gep");
+ VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+ Builder.getInt32(i),
+ "insert.gep");
+ }
+ Entry[part] = VecVal;
+ }
+ return;
+ }
+}
+
+void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ VectorParts &Entry = WidenMap.get(&*it);
+
+ switch (it->getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ continue;
+ case Instruction::PHI: {
+ // Vectorize PHINodes.
+ widenPHIInstruction(&*it, Entry, UF, VF, PV);
+ continue;
+ }// End of PHI.
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+ setDebugLocFromInst(Builder, BinOp);
+ VectorParts &A = getVectorValue(it->getOperand(0));
+ VectorParts &B = getVectorValue(it->getOperand(1));
+
+ // Use this vector value for all users of the original instruction.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+ if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+ VecOp->copyIRFlags(BinOp);
+
+ Entry[Part] = V;
+ }
+
+ propagateMetadata(Entry, &*it);
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &*it);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ VectorParts &Cond = getVectorValue(it->getOperand(0));
+ VectorParts &Op0 = getVectorValue(it->getOperand(1));
+ VectorParts &Op1 = getVectorValue(it->getOperand(2));
+
+ Value *ScalarCond = (VF == 1) ? Cond[0] :
+ Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part] = Builder.CreateSelect(
+ InvariantCond ? ScalarCond : Cond[Part],
+ Op0[Part],
+ Op1[Part]);
+ }
+
+ propagateMetadata(Entry, &*it);
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (it->getOpcode() == Instruction::FCmp);
+ CmpInst *Cmp = dyn_cast<CmpInst>(it);
+ setDebugLocFromInst(Builder, &*it);
+ VectorParts &A = getVectorValue(it->getOperand(0));
+ VectorParts &B = getVectorValue(it->getOperand(1));
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *C = nullptr;
+ if (FCmp) {
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+ cast<FCmpInst>(C)->copyFastMathFlags(&*it);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+ }
+ Entry[Part] = C;
+ }
+
+ propagateMetadata(Entry, &*it);
+ break;
+ }
+
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(&*it);
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ CastInst *CI = dyn_cast<CastInst>(it);
+ setDebugLocFromInst(Builder, &*it);
+ /// Optimize the special case where the source is the induction
+ /// variable. Notice that we can only optimize the 'trunc' case
+ /// because: a. FP conversions lose precision, b. sext/zext may wrap,
+ /// c. other casts depend on pointer size.
+ if (CI->getOperand(0) == OldInduction &&
+ it->getOpcode() == Instruction::Trunc) {
+ Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
+ CI->getType());
+ Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+ InductionDescriptor II =
+ Legal->getInductionVars()->lookup(OldInduction);
+ Constant *Step = ConstantInt::getSigned(
+ CI->getType(), II.getStepValue()->getSExtValue());
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+ propagateMetadata(Entry, &*it);
+ break;
+ }
+ /// Vectorize casts.
+ Type *DestTy = (VF == 1) ? CI->getType() :
+ VectorType::get(CI->getType(), VF);
+
+ VectorParts &A = getVectorValue(it->getOperand(0));
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+ propagateMetadata(Entry, &*it);
+ break;
+ }
+
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ break;
+ setDebugLocFromInst(Builder, &*it);
+
+ Module *M = BB->getParent()->getParent();
+ CallInst *CI = cast<CallInst>(it);
+
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ if (ID &&
+ (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
+ scalarizeInstruction(&*it);
+ break;
+ }
+      // The flag shows whether we use an intrinsic or a plain call for the
+      // vectorized version of the instruction, i.e., whether it is beneficial
+      // to emit an intrinsic call compared to a library call.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(&*it);
+ break;
+ }
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+ VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+ Arg = VectorArg[Part];
+ }
+ Args.push_back(Arg);
+ }
+
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
+ }
+ }
+ assert(VectorF && "Can't create vector function.");
+ Entry[Part] = Builder.CreateCall(VectorF, Args);
+ }
+
+ propagateMetadata(Entry, &*it);
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(&*it);
+ break;
+ }// end of switch.
+ }// end of for_each instr.
+}
+
+void InnerLoopVectorizer::updateAnalysis() {
+ // Forget the original basic block.
+ PSE.getSE()->forgetLoop(OrigLoop);
+
+ // Update the dominator tree information.
+ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
+ "Entry does not dominate exit.");
+
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+ DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
+ DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
+
+ // We don't predicate stores by this point, so the vector body should be a
+ // single loop.
+ assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
+ DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+
+ DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
+ DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
+ DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+ DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
+
+ DEBUG(DT->verifyDomTree());
+}
+
+/// \brief Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ return true;
+ for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
+ if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ emitAnalysis(VectorizationReport() << "if-conversion is disabled");
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers that we can safely read and write to.
+ SmallPtrSet<Value *, 8> SafePointes;
+
+ // Collect safe addresses.
+ for (Loop::block_iterator BI = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
+
+ if (blockNeedsPredication(BB))
+ continue;
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ SafePointes.insert(LI->getPointerOperand());
+ else if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ SafePointes.insert(SI->getPointerOperand());
+ }
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (Loop::block_iterator BI = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
+
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ emitAnalysis(VectorizationReport(BB->getTerminator())
+ << "loop contains a switch statement");
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
+ if (!blockCanBePredicated(BB, SafePointes)) {
+ emitAnalysis(VectorizationReport(BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ emitAnalysis(VectorizationReport(BB->getTerminator())
+ << "control flow cannot be substituted for a select");
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!TheLoop->getLoopPreheader()) {
+ emitAnalysis(
+ VectorizationReport() <<
+ "loop control flow is not understood by vectorizer");
+ return false;
+ }
+
+ // We can only vectorize innermost loops.
+ if (!TheLoop->empty()) {
+ emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (TheLoop->getNumBackEdges() != 1) {
+ emitAnalysis(
+ VectorizationReport() <<
+ "loop control flow is not understood by vectorizer");
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!TheLoop->getExitingBlock()) {
+ emitAnalysis(
+ VectorizationReport() <<
+ "loop control flow is not understood by vectorizer");
+ return false;
+ }
+
+  // We only handle bottom-tested loops, i.e., loops in which the condition is
+  // checked at the end of each iteration. With that we can assume that all
+  // instructions in the loop are executed the same number of times.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ emitAnalysis(
+ VectorizationReport() <<
+ "loop control flow is not understood by vectorizer");
+ return false;
+ }
+
+ // We need to have a loop header.
+ DEBUG(dbgs() << "LV: Found a loop: " <<
+ TheLoop->getHeader()->getName() << '\n');
+
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ return false;
+ }
+
+ // ScalarEvolution needs to be able to find the exit count.
+ const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop);
+ if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
+ emitAnalysis(VectorizationReport()
+ << "could not determine number of loop iterations");
+ DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ return false;
+ }
+
+ // Collect all of the variables that remain uniform after vectorization.
+ collectLoopUniforms();
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved)
+ InterleaveInfo.analyzeInterleaving(Strides);
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ emitAnalysis(VectorizationReport()
+ << "Too many SCEV assumptions need to be made and checked "
+ << "at runtime");
+ DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+ return false;
+ }
+
+ // Okay! We can vectorize. At this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return true;
+}
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+  // It is possible that chars or shorts overflow when we ask for the loop's
+  // trip count; work around this by widening the type.
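+  // For example, an i8 induction type is widened to i32 so that trip counts
+  // larger than 255 remain representable.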
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// \brief Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &Reductions) {
+ // Reduction instructions are allowed to have exit users. All other
+ // instructions must not have external users.
+ if (!Reductions.count(Inst))
+    // Check that all of the users of the instruction are inside the loop.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (F.hasFnAttribute("no-nans-fp-math"))
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+
+ // Scan the instructions in the block and look for hazards.
+ for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+ ++it) {
+
+ if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() &&
+ !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ emitAnalysis(VectorizationReport(&*it)
+ << "loop control flow is not understood by vectorizer");
+ DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (*bb != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))
+ continue;
+ emitAnalysis(VectorizationReport(&*it) <<
+ "value could not be identified as "
+ "an induction or reduction variable");
+ return false;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ emitAnalysis(VectorizationReport(&*it)
+ << "control flow not understood by vectorizer");
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
+
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) {
+ Inductions[Phi] = ID;
+ // Get the widest type.
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!Induction || PhiTy == WidestIndTy)
+ Induction = Phi;
+ }
+
+ DEBUG(dbgs() << "LV: Found an induction variable.\n");
+
+ // Until we explicitly handle the case of an induction variable with
+ // an outside loop user we have to give up vectorizing this loop.
+ if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
+ emitAnalysis(VectorizationReport(&*it) <<
+ "use of induction value outside of the "
+ "loop is not handled by vectorizer");
+ return false;
+ }
+
+ continue;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ emitAnalysis(VectorizationReport(&*it) <<
+ "value that could not be identified as "
+ "reduction is used outside the loop");
+ DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+ return false;
+ }// end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ CallInst *CI = dyn_cast<CallInst>(it);
+ if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ emitAnalysis(VectorizationReport(&*it)
+ << "call instruction cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
+ return false;
+ }
+
+      // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
+      // second argument is the same (i.e., loop invariant).
+ if (CI &&
+ hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+ auto *SE = PSE.getSE();
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+ emitAnalysis(VectorizationReport(&*it)
+ << "intrinsic instruction cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(it->getType()) &&
+ !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+ emitAnalysis(VectorizationReport(&*it)
+ << "instruction return type cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ emitAnalysis(VectorizationReport(ST) <<
+ "store instruction cannot be vectorized");
+ return false;
+ }
+ if (EnableMemAccessVersioning)
+ collectStridedAccess(ST);
+ }
+
+ if (EnableMemAccessVersioning)
+ if (LoadInst *LI = dyn_cast<LoadInst>(it))
+ collectStridedAccess(LI);
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
+ emitAnalysis(VectorizationReport(&*it) <<
+ "value cannot be used outside the loop");
+ return false;
+ }
+
+ } // next instr.
+
+ }
+
+ if (!Induction) {
+ DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ if (Inductions.empty()) {
+ emitAnalysis(VectorizationReport()
+ << "loop induction variable could not be identified");
+ return false;
+ }
+ }
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (Induction && WidestIndTy != Induction->getType())
+ Induction = nullptr;
+
+ return true;
+}
+
+void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
+ Value *Ptr = nullptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
+ Ptr = LI->getPointerOperand();
+ else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
+ Ptr = SI->getPointerOperand();
+ else
+ return;
+
+ Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop);
+ if (!Stride)
+ return;
+
+ DEBUG(dbgs() << "LV: Found a strided access that we can version");
+ DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
+ Strides[Ptr] = Stride;
+ StrideSet.insert(Stride);
+}
+
+void LoopVectorizationLegality::collectLoopUniforms() {
+ // We now know that the loop is vectorizable!
+ // Collect variables that will remain uniform after vectorization.
+ std::vector<Value*> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // Start with the conditional branch and walk up the block.
+ Worklist.push_back(Latch->getTerminator()->getOperand(0));
+
+ // Also add all consecutive pointer values; these values will be uniform
+ // after vectorization (and subsequent cleanup) and, until revectorization is
+ // supported, all dependencies must also be uniform.
+ for (Loop::block_iterator B = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); B != BE; ++B)
+ for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
+ I != IE; ++I)
+ if (I->getType()->isPointerTy() && isConsecutivePtr(&*I))
+ Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
+
+ while (!Worklist.empty()) {
+ Instruction *I = dyn_cast<Instruction>(Worklist.back());
+ Worklist.pop_back();
+
+ // Look at instructions inside this loop.
+ // Stop when reaching PHI nodes.
+ // TODO: we need to follow values all over the loop, not only in this block.
+ if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
+ continue;
+
+ // This is a known uniform.
+ Uniforms.insert(I);
+
+ // Insert all operands.
+ Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
+ }
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &LAA->getInfo(TheLoop, Strides);
+ auto &OptionalReport = LAI->getReport();
+ if (OptionalReport)
+ emitAnalysis(VectorizationReport(*OptionalReport));
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasStoreToLoopInvariantAddress()) {
+ emitAnalysis(
+ VectorizationReport()
+ << "write to a loop invariant address could not be vectorized");
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->PSE.getUnionPredicate());
+
+ return true;
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ Value *In0 = const_cast<Value*>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
+ SmallPtrSetImpl<Value *> &SafePtrs) {
+
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+ OI != OE; ++OI) {
+ if (Constant *C = dyn_cast<Constant>(*OI))
+ if (C->canTrap())
+ return false;
+ }
+ // We might be able to hoist the load.
+ if (it->mayReadFromMemory()) {
+ LoadInst *LI = dyn_cast<LoadInst>(it);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
+ if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
+ MaskedOp.insert(LI);
+ continue;
+ }
+ return false;
+ }
+ }
+
+ // We don't predicate stores at the moment.
+ if (it->mayWriteToMemory()) {
+ StoreInst *SI = dyn_cast<StoreInst>(it);
+ // We only support predication of stores in basic blocks with one
+ // predecessor.
+ if (!SI)
+ return false;
+
+ bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
+ bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
+
+ if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
+ !isSinglePredecessor) {
+ // Build a masked store if it is legal for the target, otherwise
+ // scalarize the block.
+ bool isLegalMaskedOp =
+ isLegalMaskedStore(SI->getValueOperand()->getType(),
+ SI->getPointerOperand());
+ if (isLegalMaskedOp) {
+ --NumPredStores;
+ MaskedOp.insert(SI);
+ continue;
+ }
+ return false;
+ }
+ }
+ if (it->mayThrow())
+ return false;
+
+ // The instructions below can trap.
+ switch (it->getOpcode()) {
+ default: continue;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void InterleavedAccessInfo::collectConstStridedAccesses(
+ MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
+ const ValueToValueMap &Strides) {
+ // Holds load/store instructions in program order.
+ SmallVector<Instruction *, 16> AccessList;
+
+ for (auto *BB : TheLoop->getBlocks()) {
+ bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+
+ for (auto &I : *BB) {
+ if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+ continue;
+ // FIXME: Currently we can't handle mixed accesses and predicated accesses
+ if (IsPred)
+ return;
+
+ AccessList.push_back(&I);
+ }
+ }
+
+ if (AccessList.empty())
+ return;
+
+ auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ for (auto I : AccessList) {
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides);
+
+ // The factor of the corresponding interleave group.
+ unsigned Factor = std::abs(Stride);
+
+ // Ignore the access if the factor is too small or too large.
+ if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
+ continue;
+
+ const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
+
+ // An alignment of 0 means target ABI alignment.
+ unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(PtrTy->getElementType());
+
+ StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
+ }
+}
+
+// Analyze interleaved accesses and collect them into interleave groups.
+//
+// Notice that the vectorization on interleaved groups will change instruction
+// orders and may break dependences. But the memory dependence check guarantees
+// that there is no overlap between two pointers of different strides, element
+// sizes or underlying bases.
+//
+// For pointers sharing the same stride, element size and underlying base,
+// there is no need to worry about Read-After-Write or Write-After-Read
+// dependences.
+//
+// E.g. The RAW dependence: A[i] = a;
+// b = A[i];
+// This won't exist as it is a store-load forwarding conflict, which has
+// already been checked and forbidden in the dependence check.
+//
+// E.g. The WAR dependence: a = A[i]; // (1)
+// A[i] = b; // (2)
+// The store group of (2) is always inserted at or below (2), and the load group
+// of (1) is always inserted at or above (1). The dependence is safe.
+void InterleavedAccessInfo::analyzeInterleaving(
+ const ValueToValueMap &Strides) {
+ DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
+
+ // Holds all the stride accesses.
+ MapVector<Instruction *, StrideDescriptor> StrideAccesses;
+ collectConstStridedAccesses(StrideAccesses, Strides);
+
+ if (StrideAccesses.empty())
+ return;
+
+ // Holds all interleaved store groups temporarily.
+ SmallSetVector<InterleaveGroup *, 4> StoreGroups;
+
+ // Search the load-load/write-write pair B-A in bottom-up order and try to
+ // insert B into the interleave group of A according to 3 rules:
+ // 1. A and B have the same stride.
+ // 2. A and B have the same memory object size.
+ // 3. B belongs to the group according to the distance.
+ //
+ // The bottom-up order can avoid breaking the Write-After-Write dependences
+ // between two pointers of the same base.
+ // E.g. A[i] = a; (1)
+ // A[i] = b; (2)
+ // A[i+1] = c (3)
+ // We form the group (2)+(3) in front, so (1) has to form groups with accesses
+ // above (1), which guarantees that (1) is always above (2).
+ for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
+ ++I) {
+ Instruction *A = I->first;
+ StrideDescriptor DesA = I->second;
+
+ InterleaveGroup *Group = getInterleaveGroup(A);
+ if (!Group) {
+ DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+ Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+ }
+
+ if (A->mayWriteToMemory())
+ StoreGroups.insert(Group);
+
+ for (auto II = std::next(I); II != E; ++II) {
+ Instruction *B = II->first;
+ StrideDescriptor DesB = II->second;
+
+ // Ignore if B is already in a group or B is a different memory operation.
+ if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
+ continue;
+
+      // Check rules 1 and 2.
+ if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
+ continue;
+
+      // Calculate the distance and prepare for rule 3.
+ const SCEVConstant *DistToA = dyn_cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev));
+ if (!DistToA)
+ continue;
+
+ int DistanceToA = DistToA->getAPInt().getSExtValue();
+
+      // Skip if the distance is not a multiple of the size, as then the two
+      // accesses are not in the same group.
+ if (DistanceToA % static_cast<int>(DesA.Size))
+ continue;
+
+      // The index of B is the index of A plus B's index relative to A.
+ int IndexB =
+ Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
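+      // As an illustration (hypothetical values): if A accesses A[3*i] at
+      // index 0 with a 4-byte element size and B accesses A[3*i + 2], the
+      // distance is 8 bytes, so IndexB = 0 + 8 / 4 = 2.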
+
+ // Try to insert B into the group.
+ if (Group->insertMember(B, IndexB, DesB.Align)) {
+ DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
+ << " into the interleave group with" << *A << '\n');
+ InterleaveGroupMap[B] = Group;
+
+ // Set the first load in program order as the insert position.
+ if (B->mayReadFromMemory())
+ Group->setInsertPos(B);
+ }
+ } // Iteration on instruction B
+ } // Iteration on instruction A
+
+ // Remove interleaved store groups with gaps.
+ for (InterleaveGroup *Group : StoreGroups)
+ if (Group->getNumMembers() != Group->getFactor())
+ releaseGroup(Group);
+}
+
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
+  // Width 1 means no vectorization.
+ VectorizationFactor Factor = { 1U, 0U };
+ if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
+ emitAnalysis(VectorizationReport() <<
+ "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ DEBUG(dbgs() <<
+ "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+ return Factor;
+ }
+
+ if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+ emitAnalysis(VectorizationReport() <<
+ "store that is conditionally executed prevents vectorization");
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return Factor;
+ }
+
+ // Find the trip count.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+ unsigned MaxSafeDepDist = -1U;
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+ WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
+ WidestRegister : MaxSafeDepDist);
+ unsigned MaxVectorSize = WidestRegister / WidestType;
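+  // As an illustration (hypothetical target): a 256-bit widest register and
+  // a widest type of 32 bits give MaxVectorSize = 256 / 32 = 8.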
+
+ DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
+ << WidestType << " bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is: "
+ << WidestRegister << " bits.\n");
+
+ if (MaxVectorSize == 0) {
+ DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
+ }
+
+ assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
+ " into one vector!");
+
+ unsigned VF = MaxVectorSize;
+ if (MaximizeBandwidth && !OptForSize) {
+ // Collect all viable vectorization factors.
+ SmallVector<unsigned, 8> VFs;
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+ VFs.push_back(VS);
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+ VF = VFs[i];
+ break;
+ }
+ }
+ }
+
+ // If we optimize the program for size, avoid creating the tail loop.
+ if (OptForSize) {
+ // If we are unable to calculate the trip count then don't try to vectorize.
+ if (TC < 2) {
+ emitAnalysis
+ (VectorizationReport() <<
+ "unable to calculate the loop count due to complex control flow");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return Factor;
+ }
+
+ // Find the maximum SIMD width that can fit within the trip count.
+ VF = TC % MaxVectorSize;
+
+ if (VF == 0)
+ VF = MaxVectorSize;
+ else {
+ // If the trip count that we found modulo the vectorization factor is not
+ // zero then we require a tail.
+ emitAnalysis(VectorizationReport() <<
+ "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return Factor;
+ }
+ }
+
+ int UserVF = Hints->getWidth();
+ if (UserVF != 0) {
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+
+ Factor.Width = UserVF;
+ return Factor;
+ }
+
+ float Cost = expectedCost(1);
+#ifndef NDEBUG
+ const float ScalarCost = Cost;
+#endif /* NDEBUG */
+ unsigned Width = 1;
+ DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+
+ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ if (ForceVectorization && VF > 1) {
+ Width = 2;
+ Cost = expectedCost(Width) / (float)Width;
+ }
+
+ for (unsigned i=2; i <= VF; i*=2) {
+    // Notice that the vector loop needs to be executed fewer times, so
+    // we need to divide the cost of the vector loop by the width of
+    // the vector elements.
+ float VectorCost = expectedCost(i) / (float)i;
+ DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
+ (int)VectorCost << ".\n");
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
+ Factor.Width = Width;
+ Factor.Cost = Width * Cost;
+ return Factor;
+}
+
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
+ unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+ BasicBlock *BB = *bb;
+
+ // For each instruction in the loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ Type *T = it->getType();
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&*it))
+ continue;
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+ continue;
+
+ // Examine PHI nodes that are reduction variables. Update the type to
+ // account for the recurrence type.
+ if (PHINode *PN = dyn_cast<PHINode>(it)) {
+ if (!Legal->isReductionVariable(PN))
+ continue;
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ T = RdxDesc.getRecurrenceType();
+ }
+
+ // Examine the stored values.
+ if (StoreInst *ST = dyn_cast<StoreInst>(it))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // consecutive. However, we do want to take consecutive stores/loads of
+ // pointer vectors into account.
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
+ continue;
+
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return {MinWidth, MaxWidth};
+}
+
+unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
+ unsigned VF,
+ unsigned LoopCost) {
+
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example, frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small, then we interleave to reduce the loop
+ // overhead.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
+
+ // When we optimize for size, we don't interleave.
+ if (OptForSize)
+ return 1;
+
+  // The maximum safe dependence distance already limits how wide we can go,
+  // so do not interleave.
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ return 1;
+
+ // Do not interleave loops with a relatively small trip count.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
+ return 1;
+
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
+ DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
+ " registers\n");
+
+ if (VF == 1) {
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
+  // We divide by these values below, so assume that we have at least one
+  // instruction that uses at least one register.
+ R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+ R.NumInstructions = std::max(R.NumInstructions, 1U);
+
+ // We calculate the interleave count using the following formula.
+ // Subtract the number of loop invariants from the number of available
+ // registers. These registers are used by all of the interleaved instances.
+ // Next, divide the remaining registers by the number of registers that is
+ // required by the loop, in order to estimate how many parallel instances
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two interleave count to simplify any
+ // addressing operations or alignment considerations.
+ unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+ R.MaxLocalUsers);
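+  // As an illustration (hypothetical register counts): 16 target registers,
+  // 2 loop-invariant registers and 5 local users give
+  // PowerOf2Floor((16 - 2) / 5) = 2 interleaved instances.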
+
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur)
+ IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+ std::max(1U, (R.MaxLocalUsers - 1)));
+
+ // Clamp the interleave ranges to reasonable counts.
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
+
+ // Check if the user has overridden the max.
+ if (VF == 1) {
+ if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
+ } else {
+ if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
+ }
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0)
+ LoopCost = expectedCost(VF);
+
+  // Clamp the calculated IC to be between 1 and the max interleave count
+ // that the target allows.
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
+ else if (IC < 1)
+ IC = 1;
+
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
+ if (VF > 1 && Legal->getReductionVars()->size()) {
+ DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
+ }
+
+ // Note that if we've already vectorized the loop we will have done the
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
+ (VF == 1 && Legal->getRuntimePointerChecking()->Need);
+
+ // We want to interleave small loops in order to reduce the loop overhead and
+ // potentially expose ILP opportunities.
+ DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and interleave until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
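+    // As an illustration (hypothetical costs): a loop cost of 4 and a
+    // SmallLoopCost of 20 give SmallIC = min(IC, PowerOf2Floor(20 / 4)) =
+    // min(IC, 4).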
+
+ // Interleave until store/load ports (estimated by max interleave count) are
+ // saturated.
+ unsigned NumStores = Legal->getNumStores();
+ unsigned NumLoads = Legal->getNumLoads();
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+
+    // If we have a scalar reduction (vector reductions are already dealt with
+    // by this point), we can increase the critical path length if the loop
+    // we're interleaving is inside another loop. Limit the count, by default
+    // to 2, so that the critical path only gets increased by one reduction
+    // operation.
+ if (Legal->getReductionVars()->size() &&
+ TheLoop->getLoopDepth() > 1) {
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
+ }
+
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
+ }
+
+ DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+ return SmallIC;
+ }
+
+ // Interleave if this is a large loop (small loops are already dealt with by
+ // this point) that could benefit from interleaving.
+ bool HasReductions = (Legal->getReductionVars()->size() > 0);
+ if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
+ }
+
+ DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ return 1;
+}
+
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+LoopVectorizationCostModel::calculateRegisterUsage(
+ const SmallVector<unsigned, 8> &VFs) {
+  // This function calculates the register usage by measuring the highest number
+  // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in topological order and
+  // assign a number to each instruction. We use RPO to ensure that defs are
+  // met before their users. We assume that each instruction that has in-loop
+  // users starts an interval. We record every time that an in-loop value is
+  // used, so we have a list of the first and last occurrences of each
+  // instruction. Next, we transpose this data structure into a multi map that
+  // holds the list of intervals that *end* at a specific location. This multi
+  // map allows us to perform a linear scan. We scan the instructions linearly
+  // and record each time that a new interval starts, by placing it in a set.
+  // If we find this value in the multi-map then we remove it from the set.
+  // The max register usage is the maximum size of the set.
+  // We also search for instructions that are defined outside the loop, but are
+  // used inside the loop. We need this number separately from the max-interval
+  // usage number because, when we unroll, loop-invariant values do not require
+  // more registers.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+
+ RegisterUsage RU;
+ RU.NumInstructions = 0;
+
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // instruction that is the key.
+ typedef DenseMap<Instruction*, unsigned> IntervalMap;
+  // Maps an index to its instruction.
+ DenseMap<unsigned, Instruction*> IdxToInstr;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+  // Saves the set of instructions that are used in the loop.
+ SmallSet<Instruction*, 8> Ends;
+ // Saves the list of values that are used in the loop but are
+ // defined outside the loop, such as arguments and constants.
+ SmallPtrSet<Value*, 8> LoopInvariants;
+
+ unsigned Index = 0;
+ for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+ be = DFS.endRPO(); bb != be; ++bb) {
+ RU.NumInstructions += (*bb)->size();
+ for (Instruction &I : **bb) {
+ IdxToInstr[Index++] = &I;
+
+ // Save the end location of each USE.
+ for (unsigned i = 0; i < I.getNumOperands(); ++i) {
+ Value *U = I.getOperand(i);
+ Instruction *Instr = dyn_cast<Instruction>(U);
+
+ // Ignore non-instruction values such as arguments, constants, etc.
+ if (!Instr) continue;
+
+ // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) {
+ LoopInvariants.insert(Instr);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[Instr] = Index;
+ Ends.insert(Instr);
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ typedef SmallVector<Instruction*, 2> InstrList;
+ DenseMap<unsigned, InstrList> TransposeEnds;
+
+ // Transpose the EndPoints to a list of values that end at each index.
+ for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
+ it != e; ++it)
+ TransposeEnds[it->second].push_back(it->first);
+
+ SmallSet<Instruction*, 8> OpenIntervals;
+
+ // Get the size of the widest register.
+ unsigned MaxSafeDepDist = -1U;
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+ unsigned WidestRegister =
+ std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+
+ DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ // A lambda that gets the register usage for the given type and VF.
+ auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+ unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
+ return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+ };
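+  // As an illustration (hypothetical widths): with a 256-bit widest register,
+  // an i32 scalar type and VF = 16, the lambda returns
+  // max(1, 16 * 32 / 256) = 2 vector registers.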
+
+ for (unsigned int i = 0; i < Index; ++i) {
+ Instruction *I = IdxToInstr[i];
+ // Ignore instructions that are never used within the loop.
+ if (!Ends.count(I)) continue;
+
+ // Remove all of the instructions that end at this location.
+ InstrList &List = TransposeEnds[i];
+ for (unsigned int j = 0, e = List.size(); j < e; ++j)
+ OpenIntervals.erase(List[j]);
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(I))
+ continue;
+
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ if (VFs[j] == 1) {
+ MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
+ continue;
+ }
+
+ // Count the number of live intervals.
+ unsigned RegUsage = 0;
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.count(Inst))
+ continue;
+ RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+ }
+
+ DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
+
+ // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I);
+ }
+
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ unsigned Invariant = 0;
+ if (VFs[i] == 1)
+ Invariant = LoopInvariants.size();
+ else {
+ for (auto Inst : LoopInvariants)
+ Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+ }
+
+ DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
+
+ return RUs;
+}
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ unsigned Cost = 0;
+
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+ unsigned BlockCost = 0;
+ BasicBlock *BB = *bb;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&*it))
+ continue;
+
+ unsigned C = getInstructionCost(&*it, VF);
+
+ // Check if we should override the cost.
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C = ForceTargetInstructionCost;
+
+ BlockCost += C;
+ DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
+ VF << " For instruction: " << *it << '\n');
+ }
+
+    // We assume that if-converted blocks have a 50% chance of being executed.
+    // When the code is scalar, some of the blocks are avoided due to control
+    // flow. When the code is vectorized, we execute all code paths.
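+    // As an illustration (hypothetical cost): a predicated block whose
+    // instructions cost 10 contributes only 5 to the scalar (VF == 1) cost.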
+ if (VF == 1 && Legal->blockNeedsPredication(*bb))
+ BlockCost /= 2;
+
+ Cost += BlockCost;
+ }
+
+ return Cost;
+}
+
+/// \brief Check whether the address computation for a non-consecutive memory
+/// access looks like an unlikely candidate for being merged into the indexing
+/// mode.
+///
+/// We look for a GEP which has one index that is an induction variable and all
+/// other indices are loop invariant. If the stride of this access is also
+/// within a small bound we decide that this address computation can likely be
+/// merged into the addressing mode.
+/// In all other cases, we identify the address computation as complex.
+static bool isLikelyComplexAddressComputation(Value *Ptr,
+ LoopVectorizationLegality *Legal,
+ ScalarEvolution *SE,
+ const Loop *TheLoop) {
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!Gep)
+ return true;
+
+ // We are looking for a gep with all loop invariant indices except for one
+ // which should be an induction variable.
+ unsigned NumOperands = Gep->getNumOperands();
+ for (unsigned i = 1; i < NumOperands; ++i) {
+ Value *Opd = Gep->getOperand(i);
+ if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+ !Legal->isInductionVariable(Opd))
+ return true;
+ }
+
+ // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
+ // can likely be merged into the address computation.
+ unsigned MaxMergeDistance = 64;
+
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
+ if (!AddRec)
+ return true;
+
+ // Check the step is constant.
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ // Calculate the pointer stride and check if it is consecutive.
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C)
+ return true;
+
+ const APInt &APStepVal = C->getAPInt();
+
+ // Huge step value - give up.
+ if (APStepVal.getBitWidth() > 64)
+ return true;
+
+ int64_t StepVal = APStepVal.getSExtValue();
+
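+  // As an illustration (hypothetical strides): a step of 4 bytes per
+  // iteration gives StepVal = 4 <= MaxMergeDistance, so the address
+  // computation is treated as cheap; a step of 1024 bytes would be
+  // classified as complex.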
+ return StepVal > MaxMergeDistance;
+}
+
+static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
+ return Legal->hasStride(I->getOperand(0)) ||
+ Legal->hasStride(I->getOperand(1));
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (Legal->isUniformAfterVectorization(I))
+ VF = 1;
+
+ Type *RetTy = I->getType();
+ if (VF > 1 && MinBWs.count(I))
+ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+ Type *VectorTy = ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
+
+ // TODO: We need to estimate the cost of intrinsic calls.
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
+ return 0;
+ case Instruction::Br: {
+ return TTI.getCFInstrCost(I->getOpcode());
+ }
+ case Instruction::PHI:
+ //TODO: IF-converted IFs become selects.
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Since we will replace the stride by 1 the multiplication should go away.
+ if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ return 0;
+ // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. Shifts on x86 are one example of this.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
+ Value *Op2 = I->getOperand(1);
+
+    // Check for a splat of a constant or for a non-uniform vector of constants.
+ if (isa<ConstantInt>(Op2)) {
+ ConstantInt *CInt = cast<ConstantInt>(Op2);
+ if (CInt && CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
+ if (SplatValue) {
+ ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
+ if (CInt && CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ }
+ }
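+    // As an illustration (hypothetical IR): for "shl i32 %x, 2" the second
+    // operand is a constant power of two, so Op2VK becomes
+    // OK_UniformConstantValue and Op2VP becomes OP_PowerOf2, which lets a
+    // target such as x86 report a cheaper vector shift.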
+
+ return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (!ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ auto It = MinBWs.find(Op0AsInstruction);
+ if (VF > 1 && It != MinBWs.end())
+ ValTy = IntegerType::get(ValTy->getContext(), It->second);
+ VectorTy = ToVectorTy(ValTy, VF);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ }
+ case Instruction::Store:
+ case Instruction::Load: {
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ Type *ValTy = (SI ? SI->getValueOperand()->getType() :
+ LI->getType());
+ VectorTy = ToVectorTy(ValTy, VF);
+
+ unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+ unsigned AS = SI ? SI->getPointerAddressSpace() :
+ LI->getPointerAddressSpace();
+ Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
+    // We add the cost of address computation here instead of with the GEP
+    // instruction because only here do we know whether the operation is
+    // scalarized.
+ if (VF == 1)
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
+ // For an interleaved access, calculate the total cost of the whole
+ // interleave group.
+ if (Legal->isAccessInterleaved(I)) {
+ auto Group = Legal->getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Only calculate the cost once at the insert position.
+ if (Group->getInsertPos() != I)
+ return 0;
+
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *WideVecTy =
+ VectorType::get(VectorTy->getVectorElementType(),
+ VectorTy->getVectorNumElements() * InterleaveFactor);
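+      // As an illustration (hypothetical group): a factor-2 load group whose
+      // member type is <4 x float> is costed on the wide <8 x float> type by
+      // the TTI hook below.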
+
+      // Holds the indices of existing members in an interleaved load group.
+      // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (LI) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+ Group->getAlignment(), AS);
+
+ if (Group->isReverse())
+ Cost +=
+ Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+
+      // FIXME: An interleaved load group with a huge gap could be even more
+      // expensive than scalar operations. In that case we could ignore such a
+      // group and use scalar operations instead.
+ return Cost;
+ }
+
+ // Scalarized loads/stores.
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
+ unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
+ if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
+ bool IsComplexComputation =
+ isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+ unsigned Cost = 0;
+ // The cost of extracting from the value vector and pointer vector.
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+ for (unsigned i = 0; i < VF; ++i) {
+ // The cost of extracting the pointer operand.
+ Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
+ // In case of STORE, the cost of ExtractElement from the vector.
+ // In case of LOAD, the cost of InsertElement into the returned
+ // vector.
+ Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
+ Instruction::InsertElement,
+ VectorTy, i);
+ }
+
+ // The cost of the scalar loads/stores.
+ Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+ Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+ Alignment, AS);
+ return Cost;
+ }
+
+ // Wide load/stores.
+ unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
+ AS);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+ VectorTy, 0);
+ return Cost;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+    // We optimize the truncation of induction variables.
+    // The cost of these is the same as the scalar operation.
+ if (I->getOpcode() == Instruction::Trunc &&
+ Legal->isInductionVariable(I->getOperand(0)))
+ return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
+ I->getOperand(0)->getType());
+
+ Type *SrcScalarTy = I->getOperand(0)->getType();
+ Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
+ if (VF > 1 && MinBWs.count(I)) {
+      // This cast is going to be shrunk. This may remove the cast or it might
+      // turn it into a slightly different cast. For example, if MinBW == 16,
+ // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+ //
+ // Calculate the modified src and dest types.
+ Type *MinVecTy = VectorTy;
+ if (I->getOpcode() == Instruction::Trunc) {
+ SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
+ MinVecTy);
+ } else if (I->getOpcode() == Instruction::ZExt ||
+ I->getOpcode() == Instruction::SExt) {
+ SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
+ MinVecTy);
+ }
+ }
+
+ return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ }
+ case Instruction::Call: {
+ bool NeedToScalarize;
+ CallInst *CI = cast<CallInst>(I);
+ unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+ if (getIntrinsicIDForCall(CI, TLI))
+ return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+ return CallCost;
+ }
+ default: {
+ // We are scalarizing the instruction. Return the cost of the scalar
+ // instruction, plus the cost of insert and extract into vector
+ // elements, times the vector width.
+ unsigned Cost = 0;
+
+ if (!RetTy->isVoidTy() && VF != 1) {
+ unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
+ VectorTy);
+ unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+ VectorTy);
+
+ // The cost of inserting the results plus extracting each one of the
+ // operands.
+ Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+ }
+
+ // The cost of executing VF copies of the scalar instruction. This opcode
+ // is unknown. Assume that it is the same as 'mul'.
+ Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+ return Cost;
+ }
+ }// end of switch.
+}
+
+char LoopVectorize::ID = 0;
+static const char lv_name[] = "Loop Vectorization";
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBits)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
+ return new LoopVectorize(NoUnrolling, AlwaysVectorize);
+ }
+}
+
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+ // Check for a store.
+ if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
+ return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
+
+ // Check for a load.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
+
+ return false;
+}
+
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
+ SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+
+  // Ignore induction phis that are only used in either a GetElementPtr or the
+  // ICmp instruction that exits the loop. Induction variables usually have
+  // large types and can have a big impact when estimating register usage.
+  // This is for when VF > 1.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *PN = Induction.first;
+ auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
+
+ // Check that the PHI is only used by the induction increment (UpdateV) or
+ // by GEPs. Then check that UpdateV is only used by a compare instruction or
+ // the loop header PHI.
+    // FIXME: Need precise def-use analysis to determine if this induction
+    // variable will be vectorized.
+ if (std::all_of(PN->user_begin(), PN->user_end(),
+ [&](const User *U) -> bool {
+ return U == UpdateV || isa<GetElementPtrInst>(U);
+ }) &&
+ std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
+ [&](const User *U) -> bool {
+ return U == PN || isa<ICmpInst>(U);
+ })) {
+ VecValuesToIgnore.insert(PN);
+ VecValuesToIgnore.insert(UpdateV);
+ }
+ }
+
+ // Ignore instructions that will not be vectorized.
+ // This is for when VF > 1.
+ for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
+ ++bb) {
+ for (auto &Inst : **bb) {
+ switch (Inst.getOpcode()) {
+ case Instruction::GetElementPtr: {
+      // Ignore the GEP if its induction operand is the only non-invariant
+      // index, so that the access is a consecutive load/store and won't be
+      // vectorized as a scatter/gather pattern.
+
+ GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
+ unsigned NumOperands = Gep->getNumOperands();
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
+ bool GepToIgnore = true;
+
+ // Check that all of the gep indices are uniform except for the
+ // induction operand.
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ if (i != InductionOperand &&
+ !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
+ TheLoop)) {
+ GepToIgnore = false;
+ break;
+ }
+ }
+
+ if (GepToIgnore)
+ VecValuesToIgnore.insert(&Inst);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  // Holds vector parameters or scalars, in the case of uniform values.
+ SmallVector<VectorParts, 4> Params;
+
+ setDebugLocFromInst(Builder, Instr);
+
+ // Find all of the vectorized parameters.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *SrcOp = Instr->getOperand(op);
+
+ // If we are accessing the old induction variable, use the new one.
+ if (SrcOp == OldInduction) {
+ Params.push_back(getVectorValue(SrcOp));
+ continue;
+ }
+
+ // Try using previously calculated values.
+ Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+ // If the src is an instruction that appeared earlier in the basic block
+ // then it should already be vectorized.
+ if (SrcInst && OrigLoop->contains(SrcInst)) {
+ assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+ // The parameter is a vector value from earlier.
+ Params.push_back(WidenMap.get(SrcInst));
+ } else {
+ // The parameter is a scalar from outside the loop. Maybe even a constant.
+ VectorParts Scalars;
+ Scalars.append(UF, SrcOp);
+ Params.push_back(Scalars);
+ }
+ }
+
+ assert(Params.size() == Instr->getNumOperands() &&
+ "Invalid number of operands");
+
+  // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ Value *UndefVec = IsVoidRetTy ? nullptr :
+ UndefValue::get(Instr->getType());
+ // Create a new entry in the WidenMap and initialize it to Undef or Null.
+ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+ VectorParts Cond;
+ if (IfPredicateStore) {
+ assert(Instr->getParent()->getSinglePredecessor() &&
+ "Only support single predecessor blocks");
+ Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+ Instr->getParent());
+ }
+
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each scalar that we create:
+
+ // Start an "if (pred) a[i] = ..." block.
+ Value *Cmp = nullptr;
+ if (IfPredicateStore) {
+ if (Cond[Part]->getType()->isVectorTy())
+ Cond[Part] =
+ Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
+ ConstantInt::get(Cond[Part]->getType(), 1));
+ }
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instructions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op][Part];
+ Cloned->setOperand(op, Op);
+ }
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // If the original scalar returns a value we need to place it in a vector
+ // so that future users will be able to use it.
+ if (!IsVoidRetTy)
+ VecResults[Part] = Cloned;
+
+ // End if-block.
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+ Cmp));
+ }
+}
+
+void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+ bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+ return scalarizeInstruction(Instr, IfPredicateStore);
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) {
+ return Vec;
+}
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
+ return V;
+}
+
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
+ // When unrolling and the VF is 1, we only need to add a simple scalar.
+ Type *ITy = Val->getType();
+ assert(!ITy->isVectorTy() && "Val must be a scalar");
+ Constant *C = ConstantInt::get(ITy, StartIdx);
+ return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 0000000..9ed44d1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,4224 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct a vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include <algorithm>
+#include <map>
+#include <memory>
+
+using namespace llvm;
+
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE "SLP"
+
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
+static cl::opt<int>
+ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Only vectorize if you gain more than this "
+ "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+ "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Attempt to vectorize horizontal reductions feeding into a store"));
+
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
+namespace {
+
+// FIXME: Set this via cl::opt to allow overriding.
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 12;
+
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the llvm benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
+/// \brief Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here are types which are invalid in LLVM
+/// vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
+
+/// \returns the parent basic block if all of the instructions in \p VL
+/// are in the same block or null otherwise.
+static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return nullptr;
+ BasicBlock *BB = I0->getParent();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I)
+ return nullptr;
+
+ if (BB != I->getParent())
+ return nullptr;
+ }
+ return BB;
+}
+
+/// \returns True if all of the values in \p VL are constants.
+static bool allConstant(ArrayRef<Value *> VL) {
+ for (unsigned i = 0, e = VL.size(); i < e; ++i)
+ if (!isa<Constant>(VL[i]))
+ return false;
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ if (VL[i] != VL[0])
+ return false;
+ return true;
+}
+
+///\returns Opcode that can be combined with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+ switch (Op) {
+ case Instruction::FAdd:
+ return Instruction::FSub;
+ case Instruction::FSub:
+ return Instruction::FAdd;
+ case Instruction::Add:
+ return Instruction::Sub;
+ case Instruction::Sub:
+ return Instruction::Add;
+ default:
+ return 0;
+ }
+}
+
+///\returns bool representing if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+ return Op == Instruction::FAdd || Op == Instruction::FSub ||
+ Op == Instruction::Sub || Op == Instruction::Add;
+}
+
+/// \returns ShuffleVector instruction if instructions in \p VL have
+/// an alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
+/// (e.g. opcodes of fadd,fsub,fadd,fsub,...).
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Opcode = I0->getOpcode();
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+ return 0;
+ }
+ return Instruction::ShuffleVector;
+}
+
+/// \returns The opcode if all of the Instructions in \p VL have the same
+/// opcode, or zero.
+static unsigned getSameOpcode(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return 0;
+ unsigned Opcode = I0->getOpcode();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || Opcode != I->getOpcode()) {
+ if (canCombineAsAltInst(Opcode) && i == 1)
+ return isAltInst(VL);
+ return 0;
+ }
+ }
+ return Opcode;
+}
+
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
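+///
+/// As an illustration (hypothetical scalars): if one scalar add is 'nsw' and
+/// another is not, the intersection drops 'nsw', so the vector add is emitted
+/// without it.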
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
+ if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+ // Intersection is initialized to the 0th scalar,
+ // so start counting from index '1'.
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+ Intersection->andIRFlags(Scalar);
+ }
+ VecOp->copyIRFlags(Intersection);
+ }
+ }
+}
+
+/// \returns \p I after propagating metadata from \p VL.
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ I0->getAllMetadataOtherThanDebugLoc(Metadata);
+
+ for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
+ unsigned Kind = Metadata[i].first;
+ MDNode *MD = Metadata[i].second;
+
+ for (int i = 1, e = VL.size(); MD && i != e; i++) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ MDNode *IMD = I->getMetadata(Kind);
+
+ switch (Kind) {
+ default:
+ MD = nullptr; // Remove unknown metadata
+ break;
+ case LLVMContext::MD_tbaa:
+ MD = MDNode::getMostGenericTBAA(MD, IMD);
+ break;
+ case LLVMContext::MD_alias_scope:
+ MD = MDNode::getMostGenericAliasScope(MD, IMD);
+ break;
+ case LLVMContext::MD_noalias:
+ MD = MDNode::intersect(MD, IMD);
+ break;
+ case LLVMContext::MD_fpmath:
+ MD = MDNode::getMostGenericFPMath(MD, IMD);
+ break;
+ case LLVMContext::MD_nontemporal:
+ MD = MDNode::intersect(MD, IMD);
+ break;
+ }
+ }
+ I->setMetadata(Kind, MD);
+ }
+ return I;
+}
+
+/// \returns The type that all of the values in \p VL have or null if there
+/// are different types.
+static Type* getSameType(ArrayRef<Value *> VL) {
+ Type *Ty = VL[0]->getType();
+ for (int i = 1, e = VL.size(); i < e; i++)
+ if (VL[i]->getType() != Ty)
+ return nullptr;
+
+ return Ty;
+}
+
+/// \returns True if the ExtractElement instructions in VL can be vectorized
+/// to use the original vector.
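+///
+/// As an illustration (hypothetical IR): VL = { extractelement <4 x i32> %v,
+/// i32 0 ... extractelement <4 x i32> %v, i32 3 } extracts all four lanes of
+/// %v in order, so %v can be reused directly.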
+static bool CanReuseExtract(ArrayRef<Value *> VL) {
+ assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from the same vector type.
+ unsigned NElts = Vec->getType()->getVectorNumElements();
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
+ if (!CI || CI->getZExtValue())
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+
+ if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
+/// \returns True if an in-tree use also needs an extract. This refers to a
+/// possible scalar operand in a vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ return (CI->getArgOperand(1) == Scalar);
+ }
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns the AA location that is being accessed by the instruction.
+static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return MemoryLocation::get(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
+}
+
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
+}
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+public:
+ typedef SmallVector<Value *, 8> ValueList;
+ typedef SmallVector<Instruction *, 16> InstrList;
+ typedef SmallPtrSet<Value *, 16> ValueSet;
+ typedef SmallVector<StoreInst *, 8> StoreList;
+
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC)
+ : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+ Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ }
+
+ /// \brief Vectorize the tree that starts with the elements in \p VL.
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
+
+ /// \returns the cost incurred by unwanted spills and fills, caused by
+ /// holding live values over call sites.
+ int getSpillCost();
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
+ int getTreeCost();
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
+ void buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ ExternalUses.clear();
+ NumLoadsWantToKeepOrder = 0;
+ NumLoadsWantToChangeOrder = 0;
+ for (auto &Iter : BlocksSchedules) {
+ BlockScheduling *BS = Iter.second.get();
+ BS->clear();
+ }
+ }
+
+ /// \returns true if the memory operations A and B are consecutive.
+ bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
+
+ /// \brief Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns true if it is beneficial to reverse the vector order.
+ bool shouldReorder() const {
+ return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+ }
+
+private:
+ struct TreeEntry;
+
+ /// \returns the cost of the vectorizable entry.
+ int getEntryCost(TreeEntry *E);
+
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
+
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \returns the pointer to the vectorized value if \p VL is already
+  /// vectorized, or NULL. This can happen in cycles.
+ Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+
+ /// \brief Take the pointer operand from the Load/Store instruction.
+ /// \returns NULL if this is not a valid Load/Store instruction.
+ static Value *getPointerOperand(Value *I);
+
+ /// \brief Take the address space operand from the Load/Store instruction.
+ /// \returns -1 if this is not a valid Load/Store instruction.
+ static unsigned getAddressSpaceOperand(Value *I);
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
+ int getGatherCost(Type *Ty);
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
+ int getGatherCost(ArrayRef<Value *> VL);
+
+ /// \brief Set the Builder insert point to one after the last instruction in
+ /// the bundle
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+
+ /// \returns a vector from a collection of scalars in \p VL.
+ Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+ /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree();
+
+ /// \brief Reorder commutative operands in the alternate shuffle if doing so
+ /// results in vectorized code.
+ void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
+ /// \brief Reorder commutative operands to increase the probability of
+ /// generating vectorized code.
+ void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
+ struct TreeEntry {
+ TreeEntry() : Scalars(), VectorizedValue(nullptr),
+ NeedToGather(0) {}
+
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) const {
+ assert(VL.size() == Scalars.size() && "Invalid size");
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ }
+
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue;
+
+ /// Do we need to gather this sequence?
+ bool NeedToGather;
+ };
+
+ /// Create a new VectorizableTree entry.
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
+ VectorizableTree.emplace_back();
+ int idx = VectorizableTree.size() - 1;
+ TreeEntry *Last = &VectorizableTree[idx];
+ Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+ Last->NeedToGather = !Vectorized;
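+ // Vectorized scalars are mapped to their tree entry; gathered scalars are
+ // remembered in MustGather so that they are never vectorized later.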
+ if (Vectorized) {
+ for (int i = 0, e = VL.size(); i != e; ++i) {
+ assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+ ScalarToTreeEntry[VL[i]] = idx;
+ }
+ } else {
+ MustGather.insert(VL.begin(), VL.end());
+ }
+ return Last;
+ }
+
+ /// -- Vectorization State --
+ /// Holds all of the tree entries.
+ std::vector<TreeEntry> VectorizableTree;
+
+ /// Maps a specific scalar to its tree entry.
+ SmallDenseMap<Value*, int> ScalarToTreeEntry;
+
+ /// A list of scalars that we found that we need to keep as scalars.
+ ValueSet MustGather;
+
+ /// This POD struct describes one external user in the vectorized tree.
+ struct ExternalUser {
+ ExternalUser (Value *S, llvm::User *U, int L) :
+ Scalar(S), User(U), Lane(L){}
+ // Which scalar in our function.
+ Value *Scalar;
+ // Which user that uses the scalar.
+ llvm::User *User;
+ // Which lane does the scalar belong to.
+ int Lane;
+ };
+ typedef SmallVector<ExternalUser, 16> UserList;
+
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I) {
+ I->removeFromParent();
+ I->dropAllReferences();
+ DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+
+ /// A list of values that need to be extracted out of the tree.
+ /// This list holds pairs of (Internal Scalar : External User).
+ UserList ExternalUses;
+
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> GatherSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
+
+ /// Contains all scheduling relevant data for an instruction.
+ /// A ScheduleData either represents a single instruction or a member of an
+ /// instruction bundle (= a group of instructions which is combined into a
+ /// vector instruction).
+ struct ScheduleData {
+
+ // The initial value for the dependency counters. It means that the
+ // dependencies are not calculated yet.
+ enum { InvalidDeps = -1 };
+
+ ScheduleData()
+ : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
+ NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
+ Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
+ UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+
+ void init(int BlockSchedulingRegionID) {
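+ // Reset the bundle links and region membership; dependency information is
+ // recomputed lazily via clearDependencies().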
+ FirstInBundle = this;
+ NextInBundle = nullptr;
+ NextLoadStore = nullptr;
+ IsScheduled = false;
+ SchedulingRegionID = BlockSchedulingRegionID;
+ UnscheduledDepsInBundle = UnscheduledDeps;
+ clearDependencies();
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+ /// Returns true for single instructions and for bundle representatives
+ /// (= the head of a bundle).
+ bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+ /// Returns true if it represents an instruction bundle and not only a
+ /// single instruction.
+ bool isPartOfBundle() const {
+ return NextInBundle != nullptr || FirstInBundle != this;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled depending instructions/bundles.
+ bool isReady() const {
+ assert(isSchedulingEntity() &&
+ "can't consider non-scheduling entity for ready list");
+ return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ }
+
+ /// Modifies the number of unscheduled dependencies, also updating it for
+ /// the whole bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ UnscheduledDeps += Incr;
+ return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() {
+ incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
+ MemoryDependencies.clear();
+ }
+
+ void dump(raw_ostream &os) const {
+ if (!isSchedulingEntity()) {
+ os << "/ " << *Inst;
+ } else if (NextInBundle) {
+ os << '[' << *Inst;
+ ScheduleData *SD = NextInBundle;
+ while (SD) {
+ os << ';' << *SD->Inst;
+ SD = SD->NextInBundle;
+ }
+ os << ']';
+ } else {
+ os << *Inst;
+ }
+ }
+
+ Instruction *Inst;
+
+ /// Points to the head in an instruction bundle (and always to this for
+ /// single instructions).
+ ScheduleData *FirstInBundle;
+
+ /// Single linked list of all instructions in a bundle. Null if it is a
+ /// single instruction.
+ ScheduleData *NextInBundle;
+
+ /// Single linked list of all memory instructions (e.g. load, store, call)
+ /// in the block - until the end of the scheduling region.
+ ScheduleData *NextLoadStore;
+
+ /// The dependent memory instructions.
+ /// This list is derived on demand in calculateDependencies().
+ SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID;
+
+ /// Used for getting a "good" final ordering of instructions.
+ int SchedulingPriority;
+
+ /// The number of dependencies. Consists of the number of users of the
+ /// instruction plus the number of dependent memory instructions (if any).
+ /// This value is calculated on demand.
+ /// If InvalidDeps, the number of dependencies is not calculated yet.
+ ///
+ int Dependencies;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps;
+
+ /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
+ /// single instructions.
+ int UnscheduledDepsInBundle;
+
+ /// True if this instruction is scheduled (or considered as scheduled in the
+ /// dry-run).
+ bool IsScheduled;
+ };
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD);
+#endif
+
+ /// Contains all scheduling data for a basic block.
+ ///
+ struct BlockScheduling {
+
+ BlockScheduling(BasicBlock *BB)
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
+ ScheduleStart(nullptr), ScheduleEnd(nullptr),
+ FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+ ScheduleRegionSize(0),
+ ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ SchedulingRegionID(1) {}
+
+ void clear() {
+ ReadyInsts.clear();
+ ScheduleStart = nullptr;
+ ScheduleEnd = nullptr;
+ FirstLoadStoreInRegion = nullptr;
+ LastLoadStoreInRegion = nullptr;
+
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
+ // Make a new scheduling region, i.e. all existing ScheduleData is not
+ // in the new region yet.
+ ++SchedulingRegionID;
+ }
+
+ ScheduleData *getScheduleData(Value *V) {
+ ScheduleData *SD = ScheduleDataMap[V];
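+ // Only return ScheduleData that belongs to the current scheduling region;
+ // entries left over from previous regions are treated as missing.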
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ return nullptr;
+ }
+
+ bool isInSchedulingRegion(ScheduleData *SD) {
+ return SD->SchedulingRegionID == SchedulingRegionID;
+ }
+
+ /// Marks an instruction as scheduled and puts all dependent ready
+ /// instructions into the ready-list.
+ template <typename ReadyListType>
+ void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+ SD->IsScheduled = true;
+ DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+
+ ScheduleData *BundleMember = SD;
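+ // Walk every member of the bundle and release its dependencies; any
+ // dependent bundle whose unscheduled count drops to zero becomes ready.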
+ while (BundleMember) {
+ // Handle the def-use chain dependencies.
+ for (Use &U : BundleMember->Inst->operands()) {
+ ScheduleData *OpDef = getScheduleData(U.get());
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ }
+ // Handle the memory dependencies.
+ for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+ if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
+
+ /// Put all instructions into the ReadyList which are ready for scheduling.
+ template <typename ReadyListType>
+ void initialFillReadyList(ReadyListType &ReadyList) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+ }
+ }
+ }
+
+ /// Checks if a bundle of instructions can be scheduled, i.e. has no
+ /// cyclic dependencies. This is only a dry-run, no instructions are
+ /// actually moved at this stage.
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
+
+ /// Un-bundles a group of instructions.
+ void cancelScheduling(ArrayRef<Value *> VL);
+
+ /// Extends the scheduling region so that V is inside the region.
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V);
+
+ /// Initialize the ScheduleData structures for new instructions in the
+ /// scheduling region.
+ void initScheduleData(Instruction *FromI, Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore);
+
+ /// Updates the dependency information of a bundle and of all instructions/
+ /// bundles which depend on the original bundle.
+ void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+ BoUpSLP *SLP);
+
+ /// Sets all instructions in the scheduling region to un-scheduled.
+ void resetSchedule();
+
+ BasicBlock *BB;
+
+ /// Simple memory allocation for ScheduleData.
+ std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+ /// The size of a ScheduleData array in ScheduleDataChunks.
+ int ChunkSize;
+
+ /// The allocator position in the current chunk, which is the last entry
+ /// of ScheduleDataChunks.
+ int ChunkPos;
+
+ /// Attaches ScheduleData to Instruction.
+ /// Note that the mapping survives during all vectorization iterations, i.e.
+ /// ScheduleData structures are recycled.
+ DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+ struct ReadyList : SmallVector<ScheduleData *, 8> {
+ void insert(ScheduleData *SD) { push_back(SD); }
+ };
+
+ /// The ready-list for scheduling (only used for the dry-run).
+ ReadyList ReadyInsts;
+
+ /// The first instruction of the scheduling region.
+ Instruction *ScheduleStart;
+
+ /// The first instruction _after_ the scheduling region.
+ Instruction *ScheduleEnd;
+
+ /// The first memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *FirstLoadStoreInRegion;
+
+ /// The last memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *LastLoadStoreInRegion;
+
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit;
+
+ /// The ID of the scheduling region. For a new vectorization iteration this
+ /// is incremented which "removes" all ScheduleData from the region.
+ int SchedulingRegionID;
+ };
+
+ /// Attaches the BlockScheduling structures to basic blocks.
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+ /// Performs the "real" scheduling. Done before vectorization is actually
+ /// performed in a basic block.
+ void scheduleBlock(BlockScheduling *BS);
+
+ /// List of users to ignore during scheduling and that don't need extracting.
+ ArrayRef<Value *> UserIgnoreList;
+
+ // Number of load bundles that contain consecutive loads.
+ int NumLoadsWantToKeepOrder;
+
+ // Number of load-bundles of size 2, which are consecutive loads if reversed.
+ int NumLoadsWantToChangeOrder;
+
+ // Analysis and block reference.
+ Function *F;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ TargetLibraryInfo *TLI;
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ /// Instruction builder to construct the vectorized tree.
+ IRBuilder<> Builder;
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+}
+#endif
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!getSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0);
+
+ // Collect the values that we need to extract from the tree.
+ for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+ TreeEntry *Entry = &VectorizableTree[EIdx];
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+
+ Instruction *UserInst = dyn_cast<Instruction>(U);
+ if (!UserInst)
+ continue;
+
+ // Skip in-tree scalars that become vectors
+ if (ScalarToTreeEntry.count(U)) {
+ int Idx = ScalarToTreeEntry[U];
+ TreeEntry *UseEntry = &VectorizableTree[Idx];
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in Lane 0 will
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
+ // Ignore users in the user ignore list.
+ if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
+ UserIgnoreList.end())
+ continue;
+
+ DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+ }
+ }
+ }
+}
+
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+ bool SameTy = getSameType(VL); (void)SameTy;
+ bool isAltShuffle = false;
+ assert(SameTy && "Invalid types!");
+
+ if (Depth == RecursionMaxDepth) {
+ DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (VL[0]->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ unsigned Opcode = getSameOpcode(VL);
+
+ // Check that this shuffle vector refers to the alternate
+ // sequence of opcodes.
+ if (Opcode == Instruction::ShuffleVector) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Op = I0->getOpcode();
+ if (Op != Instruction::ShuffleVector)
+ isAltShuffle = true;
+ }
+
+ // Gather if the operands are all constant, form a splat, do not share a
+ // block, or have no common opcode.
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
+ DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Don't vectorize ephemeral values.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (EphValues.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is ephemeral.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // Check if this is a duplicate of another entry.
+ if (ScalarToTreeEntry.count(VL[0])) {
+ int Idx = ScalarToTreeEntry[VL[0]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
+ if (E->Scalars[i] != VL[i]) {
+ DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+ DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (ScalarToTreeEntry.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is already in tree.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // If any of the scalars is marked as a value that needs to stay scalar then
+ // we need to gather the scalars.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (MustGather.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+
+ if (!DT->isReachableFromEntry(BB)) {
+ // Don't go into unreachable blocks. They may contain instructions with
+ // dependency cycles which confuse the final scheduling.
+ DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // Check that every instruction appears only once in this bundle.
+ for (unsigned i = 0, e = VL.size(); i < e; ++i)
+ for (unsigned j = i+1; j < e; ++j)
+ if (VL[i] == VL[j]) {
+ DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ auto &BSRef = BlocksSchedules[BB];
+ if (!BSRef) {
+ BSRef = llvm::make_unique<BlockScheduling>(BB);
+ }
+ BlockScheduling &BS = *BSRef.get();
+
+ if (!BS.tryScheduleBundle(VL, this)) {
+ DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ assert((!BS.getScheduleData(VL[0]) ||
+ !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
+ newTreeEntry(VL, false);
+ return;
+ }
+ DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+ switch (Opcode) {
+ case Instruction::PHI: {
+ PHINode *PH = dyn_cast<PHINode>(VL0);
+
+ // Check for terminator values (e.g. invoke).
+ for (unsigned j = 0; j < VL.size(); ++j)
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ TerminatorInst *Term = dyn_cast<TerminatorInst>(
+ cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
+ if (Term) {
+ DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
+ PH->getIncomingBlock(i)));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
+ case Instruction::ExtractElement: {
+ bool Reuse = CanReuseExtract(VL);
+ if (Reuse) {
+ DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+ } else {
+ BS.cancelScheduling(VL);
+ }
+ newTreeEntry(VL, Reuse);
+ return;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load.
+ // For example, we don't want to vectorize loads that are smaller than 8 bits.
+ // Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM treats
+ // loading/storing it as an i8 struct. If we vectorize loads/stores from
+ // such a struct, we read/write packed bits that disagree with the
+ // unvectorized version.
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *ScalarTy = VL[0]->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) !=
+ DL.getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
+ // Check if the loads are consecutive or if we need to swizzle them.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+ LoadInst *L = cast<LoadInst>(VL[i]);
+ if (!L->isSimple()) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ return;
+ }
+
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
+ ++NumLoadsWantToChangeOrder;
+ }
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ return;
+ }
+ }
+ ++NumLoadsWantToKeepOrder;
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ return;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ return;
+ }
+ }
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ CmpInst *Cmp = cast<CmpInst>(VL[i]);
+ if (Cmp->getPredicate() != P0 ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::Select:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ return;
+ }
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ auto Op = cast<Instruction>(VL[j])->getOperand(1);
+ if (!isa<ConstantInt>(Op)) {
+ DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
+ case Instruction::Store: {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ // Check if the stores are consecutive or if we need to swizzle them.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return;
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+
+ ValueList Operands;
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+
+ buildTree_rec(Operands, Depth + 1);
+ return;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic.
+ CallInst *CI = cast<CallInst>(VL[0]);
+ // Check if this is an intrinsic call or something that can be
+ // represented by an intrinsic call.
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ if (!isTriviallyVectorizable(ID)) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return;
+ }
+ Function *Int = CI->getCalledFunction();
+ Value *A1I = nullptr;
+ if (hasVectorInstrinsicScalarOpd(ID, 1))
+ A1I = CI->getArgOperand(1);
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
+ if (!CI2 || CI2->getCalledFunction() != Int ||
+ getIntrinsicIDForCall(CI2, TLI) != ID) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
+ << "\n");
+ return;
+ }
+ // ctlz, cttz and powi are special intrinsics whose second argument
+ // must be the same for them to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ Value *A1J = CI2->getArgOperand(1);
+ if (A1I != A1J) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument "<< A1I<<"!=" << A1J
+ << "\n");
+ return;
+ }
+ }
+ }
+
+ newTreeEntry(VL, true);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
+ Operands.push_back(CI2->getArgOperand(i));
+ }
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcodes like add-sub,
+ // then do not vectorize this instruction.
+ if (!isAltShuffle) {
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderAltShuffleOperands(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ return;
+ }
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
+ default:
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return;
+ }
+}
+
+int BoUpSLP::getEntryCost(TreeEntry *E) {
+ ArrayRef<Value*> VL = E->Scalars;
+
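+ // For stores the relevant scalar type is the type of the stored value, not
+ // of the pointer operand.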
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ if (E->NeedToGather) {
+ if (allConstant(VL))
+ return 0;
+ if (isSplat(VL)) {
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+ return getGatherCost(E->Scalars);
+ }
+ unsigned Opcode = getSameOpcode(VL);
+ assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ switch (Opcode) {
+ case Instruction::PHI: {
+ return 0;
+ }
+ case Instruction::ExtractElement: {
+ if (CanReuseExtract(VL)) {
+ int DeadCost = 0;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ if (E->hasOneUse())
+ // Take credit for instruction that will become dead.
+ DeadCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
+ return -DeadCost;
+ }
+ return getGatherCost(VecTy);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+
+ // Calculate the cost of this instruction.
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+ VL0->getType(), SrcTy);
+
+ VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ return VecCost - ScalarCost;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Calculate the cost of this instruction.
+ int ScalarCost = 0;
+ int VecCost = 0;
+ if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
+ Opcode == Instruction::Select) {
+ VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ ScalarCost = VecTy->getNumElements() *
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+ VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ } else {
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_None;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt = nullptr;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ if (!isa<ConstantInt>(I->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+ if (i == 0) {
+ CInt = cast<ConstantInt>(I->getOperand(1));
+ continue;
+ }
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+ CInt != cast<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
+ // FIXME: Currently the cost model modification for division by a
+ // power of 2 is handled only for X86. Add support for other targets.
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+ CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+ ScalarCost = VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
+ VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+ Op1VP, Op2VP);
+ }
+ return VecCost - ScalarCost;
+ }
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ int VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+ return VecCost - ScalarCost;
+ }
+ case Instruction::Load: {
+ // Cost of wide load - cost of scalar loads.
+ int ScalarLdCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+ return VecLdCost - ScalarLdCost;
+ }
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+ int ScalarStCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+ return VecStCost - ScalarStCost;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ SmallVector<Type*, 4> ScalarTys, VecTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+ ScalarTys.push_back(CI->getArgOperand(op)->getType());
+ VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+ VecTy->getNumElements()));
+ }
+
+ int ScalarCallCost = VecTy->getNumElements() *
+ TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
+
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
+
+ DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
+
+ return VecCallCost - ScalarCallCost;
+ }
+ case Instruction::ShuffleVector: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+ int ScalarCost = 0;
+ int VecCost = 0;
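+ // The scalar cost is the sum of the costs of the individual (alternating)
+ // opcodes in the sequence.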
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ if (!I)
+ break;
+ ScalarCost +=
+ TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ }
+ // VecCost is the sum of the cost of creating the two vectors
+ // and the cost of creating the shuffle.
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ VecCost =
+ TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+ Instruction *I1 = cast<Instruction>(VL[1]);
+ VecCost +=
+ TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+ VecCost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+ return VecCost - ScalarCost;
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
+ }
+}
+
+bool BoUpSLP::isFullyVectorizableTinyTree() {
+ DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
+ VectorizableTree.size() << " is fully vectorizable.\n");
+
+ // We only handle trees of height 2.
+ if (VectorizableTree.size() != 2)
+ return false;
+
+ // Handle splat stores.
+ if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
+ return true;
+
+ // Gathering cost would be too much for tiny trees.
+ if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
+ return false;
+
+ return true;
+}
+
+int BoUpSLP::getSpillCost() {
+ // Walk from the bottom of the tree to the top, tracking which values are
+ // live. When we see a call instruction that is not part of our tree,
+ // query TTI to see if there is a cost to keeping values live over it
+ // (for example, if spills and fills are required).
+ unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+ int Cost = 0;
+
+ SmallPtrSet<Instruction*, 4> LiveValues;
+ Instruction *PrevInst = nullptr;
+
+ for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
+ Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
+ if (!Inst)
+ continue;
+
+ if (!PrevInst) {
+ PrevInst = Inst;
+ continue;
+ }
+
+ DEBUG(
+ dbgs() << "SLP: #LV: " << LiveValues.size();
+ for (auto *X : LiveValues)
+ dbgs() << " " << X->getName();
+ dbgs() << ", Looking at ";
+ Inst->dump();
+ );
+
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
+ // Now find the sequence of instructions between PrevInst and Inst.
+ BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
+ PrevInstIt(PrevInst->getIterator());
+ --PrevInstIt;
+ while (InstIt != PrevInstIt) {
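+ // When the backwards walk reaches the top of PrevInst's block, continue
+ // scanning from the bottom of Inst's block.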
+ if (PrevInstIt == PrevInst->getParent()->rend()) {
+ PrevInstIt = Inst->getParent()->rbegin();
+ continue;
+ }
+
+ if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(VectorType::get(II->getType(), BundleWidth));
+ Cost += TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
+ ++PrevInstIt;
+ }
+
+ PrevInst = Inst;
+ }
+
+ DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
+ return Cost;
+}
+
+int BoUpSLP::getTreeCost() {
+ int Cost = 0;
+ DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
+ VectorizableTree.size() << ".\n");
+
+ // We only vectorize a tiny tree if it is fully vectorizable.
+ if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
+ if (VectorizableTree.empty()) {
+ assert(!ExternalUses.size() && "We should not have any external users");
+ }
+ return INT_MAX;
+ }
+
+ unsigned BundleWidth = VectorizableTree[0].Scalars.size();
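+ // The root bundle width determines the vector type used when costing the
+ // external extracts below.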
+
+ for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
+ int C = getEntryCost(&VectorizableTree[i]);
+ DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
+ << *VectorizableTree[i].Scalars[0] << " .\n");
+ Cost += C;
+ }
+
+ SmallSet<Value *, 16> ExtractCostCalculated;
+ int ExtractCost = 0;
+ for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
+ I != E; ++I) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(I->Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(I->User))
+ continue;
+
+ VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
+ ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ I->Lane);
+ }
+
+ Cost += getSpillCost();
+
+ DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
+ return Cost + ExtractCost;
+}
+
+int BoUpSLP::getGatherCost(Type *Ty) {
+ int Cost = 0;
+ for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ return Cost;
+}
+
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
+ // Find the type of the operands in VL.
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ // Find the cost of inserting/extracting values from the vector.
+ return getGatherCost(VecTy);
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ return nullptr;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
+ Value *PtrA = getPointerOperand(A);
+ Value *PtrB = getPointerOperand(B);
+ unsigned ASA = getAddressSpaceOperand(A);
+ unsigned ASB = getAddressSpaceOperand(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers of the same type.
+ if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
+ return false;
+
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+
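+ // Strip constant in-bounds GEP offsets from both pointers and accumulate
+ // them, so equal bases can be compared by offset alone.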
+ APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == Size;
+
+ // Compute the base pointer delta needed for the final delta to equal the
+ // store size of the element type.
+ APInt BaseDelta = Size - OffsetDelta;
+
+ // Otherwise compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+ const SCEV *C = SE->getConstant(BaseDelta);
+ const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
+ return X == PtrSCEVB;
+}
+
+// Reorder commutative operations in an alternate shuffle if the resulting
+// vectors are consecutive loads. This would allow us to vectorize the tree.
+// If we have something like:
+// load a[0] - load b[0]
+// load b[1] + load a[1]
+// load a[2] - load b[2]
+// load a[3] + load b[3]
+// then reordering the operands of the second row (load b[1] + load a[1]) would
+// allow us to vectorize this code.
+void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Push left and right operands of binary operation into Left and Right
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
+ Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ }
+
+ // Reorder if we have a commutative operation and consecutive accesses
+ // are on either side of the alternate instructions.
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ }
+}
+
+// Return true if I should be commuted before adding its left and right
+// operands to the arrays Left and Right.
+//
+// The vectorizer is trying to either have all elements on one side be
+// instructions with the same opcode (to enable further vectorization), or to
+// have a splat (to lower the vectorization cost).
+static bool shouldReorderOperands(int i, Instruction &I,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ bool AllSameOpcodeLeft,
+ bool AllSameOpcodeRight, bool SplatLeft,
+ bool SplatRight) {
+ Value *VLeft = I.getOperand(0);
+ Value *VRight = I.getOperand(1);
+ // If we have "SplatRight", try to see if commuting is needed to preserve it.
+ if (SplatRight) {
+ if (VRight == Right[i - 1])
+ // Preserve SplatRight
+ return false;
+ if (VLeft == Right[i - 1]) {
+ // Commuting would preserve SplatRight, but we don't want to break
+ // SplatLeft either, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (SplatLeft && VLeft == Left[i - 1])
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Right side.
+ if (SplatLeft) {
+ if (VLeft == Left[i - 1])
+ // Preserve SplatLeft
+ return false;
+ if (VRight == Left[i - 1])
+ return true;
+ }
+
+ Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+ Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+ // If we have "AllSameOpcodeRight", try to see if the left operands preserves
+ // it and not the right, in this case we want to commute.
+ if (AllSameOpcodeRight) {
+ unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+ if (IRight && RightPrevOpcode == IRight->getOpcode())
+ // Do not commute, a match on the right preserves AllSameOpcodeRight
+ return false;
+ if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+ // We have a match and may want to commute, but first check if there is
+ // not also a match on the existing operands on the Left to preserve
+ // AllSameOpcodeLeft, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (AllSameOpcodeLeft && ILeft &&
+ cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle Left side.
+ if (AllSameOpcodeLeft) {
+ unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+ if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+ return false;
+ if (IRight && LeftPrevOpcode == IRight->getOpcode())
+ return true;
+ }
+ return false;
+}
+
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+
+ if (VL.size()) {
+ // Peel the first iteration out of the loop since there's nothing
+ // interesting to do anyway and it simplifies the checks in the loop.
+ auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
+ auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+ if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
+ // Favor having an instruction on the right. FIXME: why?
+ std::swap(VLeft, VRight);
+ Left.push_back(VLeft);
+ Right.push_back(VRight);
+ }
+
+ // Keep track if we have instructions with all the same opcode on one side.
+ bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
+ bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+ // Keep track if we have one side with all the same value (broadcast).
+ bool SplatLeft = true;
+ bool SplatRight = true;
+
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ assert(I->isCommutative() && "Can only process commutative instruction");
+ // Commute to favor either a splat or maximizing having the same opcodes on
+ // one side.
+ if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight)) {
+ Left.push_back(I->getOperand(1));
+ Right.push_back(I->getOperand(0));
+ } else {
+ Left.push_back(I->getOperand(0));
+ Right.push_back(I->getOperand(1));
+ }
+ // Update Splat* and AllSameOpcode* after the insertion.
+ SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+ SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+ AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+ (cast<Instruction>(Left[i - 1])->getOpcode() ==
+ cast<Instruction>(Left[i])->getOpcode());
+ AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+ (cast<Instruction>(Right[i - 1])->getOpcode() ==
+ cast<Instruction>(Right[i])->getOpcode());
+ }
+
+ // If one operand ends up being a broadcast, keep this operand order.
+ if (SplatRight || SplatLeft)
+ return;
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Finally check if we can get longer vectorizable chain by reordering
+ // without breaking the good operand order detected above.
+ // E.g., if we have something like:
+ // load a[0] load b[0]
+ // load b[1] load a[1]
+ // load a[2] load b[2]
+ // load a[3] load b[3]
+ // Reordering the second row (load b[1], load a[1]) would allow us to
+ // vectorize this code while still retaining the AllSameOpcode property.
+ // FIXME: This load reordering might break AllSameOpcode in some rare cases
+ // such as:
+ // add a[0],c[0] load b[0]
+ // add a[1],c[2] load b[1]
+ // b[2] load b[2]
+ // add a[3],c[3] load b[3]
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ if (isConsecutiveAccess(L, L1, DL)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ if (isConsecutiveAccess(L, L1, DL)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ // else unchanged
+ }
+}
+
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
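+ // Place new instructions immediately after the first scalar of the bundle
+ // and reuse its debug location.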
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ BasicBlock::iterator NextInst(VL0);
+ ++NextInst;
+ Builder.SetInsertPoint(VL0->getParent(), NextInst);
+ Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+}
+
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+ Value *Vec = UndefValue::get(Ty);
+ // Generate the 'InsertElement' instruction.
+ for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+ Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+ if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
+ GatherSeq.insert(Insrt);
+ CSEBlocks.insert(Insrt->getParent());
+
+ // Add to our 'need-to-extract' list.
+ if (ScalarToTreeEntry.count(VL[i])) {
+ int Idx = ScalarToTreeEntry[VL[i]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ // Find which lane we need to extract.
+ int FoundLane = -1;
+ for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+ // Is this the lane of the scalar that we are looking for?
+ if (E->Scalars[Lane] == VL[i]) {
+ FoundLane = Lane;
+ break;
+ }
+ }
+ assert(FoundLane >= 0 && "Could not find the correct lane");
+ ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+ }
+ }
+ }
+
+ return Vec;
+}
+
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
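+ // Reuse an existing vector value only if the recorded tree entry matches
+ // the whole bundle and has already been vectorized.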
+ SmallDenseMap<Value*, int>::const_iterator Entry
+ = ScalarToTreeEntry.find(VL[0]);
+ if (Entry != ScalarToTreeEntry.end()) {
+ int Idx = Entry->second;
+ const TreeEntry *En = &VectorizableTree[Idx];
+ if (En->isSame(VL) && En->VectorizedValue)
+ return En->VectorizedValue;
+ }
+ return nullptr;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ if (ScalarToTreeEntry.count(VL[0])) {
+ int Idx = ScalarToTreeEntry[VL[0]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ if (E->isSame(VL))
+ return vectorizeTree(E);
+ }
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ return Gather(VL, VecTy);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+
+ if (E->VectorizedValue) {
+ DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Type *ScalarTy = VL0->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+
+ if (E->NeedToGather) {
+ setInsertPointAfterBundle(E->Scalars);
+ return Gather(E->Scalars, VecTy);
+ }
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ unsigned Opcode = getSameOpcode(E->Scalars);
+
+ switch (Opcode) {
+ case Instruction::PHI: {
+ PHINode *PH = dyn_cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ E->VectorizedValue = NewPhi;
+
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallSet<BasicBlock*, 4> VisitedBBs;
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ BasicBlock *IBB = PH->getIncomingBlock(i);
+
+ if (!VisitedBBs.insert(IBB).second) {
+ NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+ continue;
+ }
+
+ // Prepare the operand vector.
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
+
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ Value *Vec = vectorizeTree(Operands);
+ NewPhi->addIncoming(Vec, IBB);
+ }
+
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ return NewPhi;
+ }
+
+ case Instruction::ExtractElement: {
+ if (CanReuseExtract(E->Scalars)) {
+ Value *V = VL0->getOperand(0);
+ E->VectorizedValue = V;
+ return V;
+ }
+ return Gather(E->Scalars, VecTy);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ ValueList INVL;
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *InVec = vectorizeTree(INVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ CastInst *CI = dyn_cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ ValueList LHSV, RHSV;
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *L = vectorizeTree(LHSV);
+ Value *R = vectorizeTree(RHSV);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Value *V;
+ if (Opcode == Instruction::FCmp)
+ V = Builder.CreateFCmp(P0, L, R);
+ else
+ V = Builder.CreateICmp(P0, L, R);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Select: {
+ ValueList TrueVec, FalseVec, CondVec;
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *Cond = vectorizeTree(CondVec);
+ Value *True = vectorizeTree(TrueVec);
+ Value *False = vectorizeTree(FalseVec);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ Value *V = Builder.CreateSelect(Cond, True, False);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ ValueList LHSVL, RHSVL;
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+ reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+ else
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+ }
+
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
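+ // If both vectorized operands collapsed to the same value, the scalars
+ // should all have had identical operands (e.g. x + x); the assert below
+ // checks that invariant for the first scalar.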
+ if (LHS == RHS && isa<Instruction>(LHS)) {
+ assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+ }
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
+ ++NumVectorInstructions;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ setInsertPointAfterBundle(E->Scalars);
+
+ LoadInst *LI = cast<LoadInst>(VL0);
+ Type *ScalarLoadTy = LI->getType();
+ unsigned AS = LI->getPointerAddressSpace();
+
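+ // Reuse the first scalar load's pointer: bitcast it to a pointer to the
+ // vector type and emit a single wide load (alignment is fixed up below).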
+ Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+
+ // The pointer operand uses an in-tree scalar, so we add the new BitCast to
+ // the ExternalUses list to make sure that an extract will be generated in
+ // the future.
+ if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
+ unsigned Alignment = LI->getAlignment();
+ LI = Builder.CreateLoad(VecPtr);
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ }
+ LI->setAlignment(Alignment);
+ E->VectorizedValue = LI;
+ ++NumVectorInstructions;
+ return propagateMetadata(LI, E->Scalars);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(VL0);
+ unsigned Alignment = SI->getAlignment();
+ unsigned AS = SI->getPointerAddressSpace();
+
+ ValueList ValueOp;
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
+
+ setInsertPointAfterBundle(E->Scalars);
+
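+ // Vectorize the stored values, then bitcast the first scalar store's
+ // pointer to a vector pointer and emit a single wide store.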
+ Value *VecValue = vectorizeTree(ValueOp);
+ Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
+ StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+ // The pointer operand uses an in-tree scalar, so we add the new BitCast to
+ // the ExternalUses list to make sure that an extract will be generated in
+ // the future.
+ if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
+ S->setAlignment(Alignment);
+ E->VectorizedValue = S;
+ ++NumVectorInstructions;
+ return propagateMetadata(S, E->Scalars);
+ }
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E->Scalars);
+
+ ValueList Op0VL;
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
+
+ Value *Op0 = vectorizeTree(Op0VL);
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList OpVL;
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
+
+ Value *OpVec = vectorizeTree(OpVL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ setInsertPointAfterBundle(E->Scalars);
+ Function *FI;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Value *ScalarArg = nullptr;
+ if (CI && (FI = CI->getCalledFunction())) {
+ IID = FI->getIntrinsicID();
+ }
+ std::vector<Value *> OpVecs;
+ for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ ValueList OpVL;
+ // ctlz, cttz and powi are special intrinsics whose second argument is
+ // a scalar. This argument should not be vectorized.
+ if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+ CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ ScalarArg = CEI->getArgOperand(j);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
+ OpVL.push_back(CEI->getArgOperand(j));
+ }
+
+ Value *OpVec = vectorizeTree(OpVL);
+ DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ }
+
+ Module *M = F->getParent();
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+ Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ Value *V = Builder.CreateCall(CF, OpVecs);
+
+ // The scalar argument uses an in-tree scalar, so we add the new vectorized
+ // call to the ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::ShuffleVector: {
+ ValueList LHSVL, RHSVL;
+ assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
+ reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ // Create a vector of LHS op1 RHS
+ BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+ Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+ // Create a vector of LHS op2 RHS
+ Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+ BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+ Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up odd and even scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OddScalars, EvenScalars;
+ unsigned e = E->Scalars.size();
+ SmallVector<Constant *, 8> Mask(e);
+ for (unsigned i = 0; i < e; ++i) {
+ if (i & 1) {
+ Mask[i] = Builder.getInt32(e + i);
+ OddScalars.push_back(E->Scalars[i]);
+ } else {
+ Mask[i] = Builder.getInt32(i);
+ EvenScalars.push_back(E->Scalars[i]);
+ }
+ }
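+ // E.g. for e == 4 the mask is <0, 5, 2, 7>: even lanes are taken from V0
+ // (indices 0..e-1) and odd lanes from V1 (indices e..2e-1).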
+
+ Value *ShuffleMask = ConstantVector::get(Mask);
+ propagateIRFlags(V0, EvenScalars);
+ propagateIRFlags(V1, OddScalars);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
+ default:
+ llvm_unreachable("unknown inst");
+ }
+ return nullptr;
+}
+
+Value *BoUpSLP::vectorizeTree() {
+
+ // All blocks must be scheduled before any instructions are inserted.
+ for (auto &BSIter : BlocksSchedules) {
+ scheduleBlock(BSIter.second.get());
+ }
+
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ vectorizeTree(&VectorizableTree[0]);
+
+ DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+
+ // Extract all of the elements with the external uses.
+ for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
+ it != e; ++it) {
+ Value *Scalar = it->Scalar;
+ llvm::User *User = it->User;
+
+ // Skip users that we already RAUW'd. This happens when one instruction
+ // has multiple uses of the same value.
+ if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
+ Scalar->user_end())
+ continue;
+ assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
+
+ int Idx = ScalarToTreeEntry[Scalar];
+ TreeEntry *E = &VectorizableTree[Idx];
+ assert(!E->NeedToGather && "Extracting from a gather list");
+
+ Value *Vec = E->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
+
+ Value *Lane = Builder.getInt32(it->Lane);
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
+ if (isa<Instruction>(Vec)){
+ if (PHINode *PH = dyn_cast<PHINode>(User)) {
+ for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+ if (PH->getIncomingValue(i) == Scalar) {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PH->getIncomingBlock(i));
+ PH->setOperand(i, Ex);
+ }
+ }
+ } else {
+ Builder.SetInsertPoint(cast<Instruction>(User));
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(&F->getEntryBlock());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+
+ DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ }
+
+ // For each vectorized value:
+ for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+ TreeEntry *Entry = &VectorizableTree[EIdx];
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+#ifndef NDEBUG
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ assert((ScalarToTreeEntry.count(U) ||
+ // It is legal to replace users in the ignorelist by undef.
+ (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
+ UserIgnoreList.end())) &&
+ "Replacing out-of-tree value with undef");
+ }
+#endif
+ Value *Undef = UndefValue::get(Ty);
+ Scalar->replaceAllUsesWith(Undef);
+ }
+ DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ eraseInstruction(cast<Instruction>(Scalar));
+ }
+ }
+
+ Builder.ClearInsertionPoint();
+
+ return VectorizableTree[0].VectorizedValue;
+}
+
+void BoUpSLP::optimizeGatherSequence() {
+ DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
+ // LICM InsertElementInst sequences.
+ for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
+ e = GatherSeq.end(); it != e; ++it) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+
+ if (!Insert)
+ continue;
+
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(Insert->getParent());
+ if (!L)
+ continue;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ continue;
+
+ // If the vector or the element that we insert into it are
+ // instructions that are defined in this basic block then we can't
+ // hoist this instruction.
+ Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+ Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+ if (CurrVec && L->contains(CurrVec))
+ continue;
+ if (NewElem && L->contains(NewElem))
+ continue;
+
+ // We can hoist this instruction. Move it to the pre-header.
+ Insert->moveBefore(PreHeader->getTerminator());
+ }
+
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
+ // Perform O(N^2) search over the gather sequences and merge identical
+ // instructions. TODO: We can further optimize this scan if we split the
+ // instructions into different buckets based on the insert lane.
+ SmallVector<Instruction *, 16> Visited;
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = &*it++;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
+ ve = Visited.end();
+ v != ve; ++v) {
+ if (In->isIdenticalTo(*v) &&
+ DT->dominates((*v)->getParent(), In->getParent())) {
+ In->replaceAllUsesWith(*v);
+ eraseInstruction(In);
+ In = nullptr;
+ break;
+ }
+ }
+ if (In) {
+ assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+ Visited.push_back(In);
+ }
+ }
+ }
+ CSEBlocks.clear();
+ GatherSeq.clear();
+}
+
+// Groups the instructions into a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle gets ready.
+bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
+ BoUpSLP *SLP) {
+ if (isa<PHINode>(VL[0]))
+ return true;
+
+ // Initialize the instruction bundle.
+ Instruction *OldScheduleEnd = ScheduleEnd;
+ ScheduleData *PrevInBundle = nullptr;
+ ScheduleData *Bundle = nullptr;
+ bool ReSchedule = false;
+ DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V))
+ return false;
+ }
+
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member (maybe not in same basic block)");
+ if (BundleMember->IsScheduled) {
+ // A bundle member was scheduled as a single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
+ }
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+ // Group the instructions into a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ if (ScheduleEnd != OldScheduleEnd) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to
+ // recalculate all dependencies.
+ // It seldom happens that this needs to be done a second time after adding
+ // the initial bundle to the region.
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ SD->clearDependencies();
+ }
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+
+ DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
+
+ calculateDependencies(Bundle, true, SLP);
+
+ // Now try to schedule the new bundle. As soon as the bundle is "ready" it
+ // means that there are no cyclic dependencies and we can schedule it.
+ // Note that it's important that we don't "schedule" the bundle yet (see
+ // cancelScheduling).
+ while (!Bundle->isReady() && !ReadyInsts.empty()) {
+
+ ScheduleData *pickedSD = ReadyInsts.back();
+ ReadyInsts.pop_back();
+
+ if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
+ schedule(pickedSD, ReadyInsts);
+ }
+ }
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL);
+ return false;
+ }
+ return true;
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
+ if (isa<PHINode>(VL[0]))
+ return;
+
+ ScheduleData *Bundle = getScheduleData(VL[0]);
+ DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ assert(!Bundle->IsScheduled &&
+ "Can't cancel bundle which is already scheduled");
+ assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ "tried to unbundle something which is not a bundle");
+
+ // Un-bundle: make single instructions out of the bundle.
+ ScheduleData *BundleMember = Bundle;
+ while (BundleMember) {
+ assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+ BundleMember->FirstInBundle = BundleMember;
+ ScheduleData *Next = BundleMember->NextInBundle;
+ BundleMember->NextInBundle = nullptr;
+ BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+ if (BundleMember->UnscheduledDepsInBundle == 0) {
+ ReadyInsts.insert(BundleMember);
+ }
+ BundleMember = Next;
+ }
+}
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+ if (getScheduleData(V))
+ return true;
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert(I && "bundle member must be an instruction");
+ assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ if (!ScheduleStart) {
+ // It's the first instruction in the new region.
+ initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+ ScheduleStart = I;
+ ScheduleEnd = I->getNextNode();
+ assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+ DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ return true;
+ }
+ // Search up and down at the same time, because we don't know if the new
+ // instruction is above or below the existing scheduling region.
+ BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
+ BasicBlock::reverse_iterator UpperEnd = BB->rend();
+ BasicBlock::iterator DownIter(ScheduleEnd);
+ BasicBlock::iterator LowerEnd = BB->end();
+ for (;;) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
+ if (UpIter != UpperEnd) {
+ if (&*UpIter == I) {
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
+ return true;
+ }
+ UpIter++;
+ }
+ if (DownIter != LowerEnd) {
+ if (&*DownIter == I) {
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+ DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
+ return true;
+ }
+ DownIter++;
+ }
+ assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+ "instruction not found in block");
+ }
+ return true;
+}
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+ Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore) {
+ ScheduleData *CurrentLoadStore = PrevLoadStore;
+ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+ ScheduleData *SD = ScheduleDataMap[I];
+ if (!SD) {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(
+ llvm::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+ ScheduleDataMap[I] = SD;
+ SD->Inst = I;
+ }
+ assert(!isInSchedulingRegion(SD) &&
+ "new ScheduleData already in scheduling region");
+ SD->init(SchedulingRegionID);
+
+ if (I->mayReadOrWriteMemory()) {
+ // Update the linked list of memory accessing instructions.
+ if (CurrentLoadStore) {
+ CurrentLoadStore->NextLoadStore = SD;
+ } else {
+ FirstLoadStoreInRegion = SD;
+ }
+ CurrentLoadStore = SD;
+ }
+ }
+ if (NextLoadStore) {
+ if (CurrentLoadStore)
+ CurrentLoadStore->NextLoadStore = NextLoadStore;
+ } else {
+ LastLoadStoreInRegion = CurrentLoadStore;
+ }
+}
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+ bool InsertInReadyList,
+ BoUpSLP *SLP) {
+ assert(SD->isSchedulingEntity());
+
+ SmallVector<ScheduleData *, 10> WorkList;
+ WorkList.push_back(SD);
+
+ while (!WorkList.empty()) {
+ ScheduleData *SD = WorkList.back();
+ WorkList.pop_back();
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ assert(isInSchedulingRegion(BundleMember));
+ if (!BundleMember->hasValidDependencies()) {
+
+ DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ } else {
+ // I'm not sure if this can ever happen. But we need to be safe.
+ // This ensures the instruction/bundle can never be scheduled, which
+ // eventually disables vectorization.
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ }
+
+ // Handle the memory dependencies.
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (DepDest) {
+ Instruction *SrcInst = BundleMember->Inst;
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+ while (DepDest) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ DepDest = DepDest->NextLoadStore;
+
+ // Example, explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+ // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
+ }
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ if (InsertInReadyList && SD->isReady()) {
+ ReadyInsts.push_back(SD);
+ DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
+ }
+ }
+}
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+ assert(ScheduleStart &&
+ "tried to reset schedule on block which has not been scheduled");
+ for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ ScheduleData *SD = getScheduleData(I);
+ assert(isInSchedulingRegion(SD));
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ }
+ ReadyInsts.clear();
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+
+ if (!BS->ScheduleStart)
+ return;
+
+ DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+ BS->resetSchedule();
+
+ // For the real scheduling we use a more sophisticated ready-list: it is
+ // sorted by the original instruction location. This lets the final schedule
+ // be as close as possible to the original instruction order.
+ struct ScheduleDataCompare {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
+ return SD2->SchedulingPriority < SD1->SchedulingPriority;
+ }
+ };
+ std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+ // Ensure that all dependency data is updated and fill the ready-list with
+ // initial instructions.
+ int Idx = 0;
+ int NumToSchedule = 0;
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+ I = I->getNextNode()) {
+ ScheduleData *SD = BS->getScheduleData(I);
+ assert(
+ SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
+ "scheduler and vectorizer have different opinion on what is a bundle");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ }
+ BS->initialFillReadyList(ReadyInsts);
+
+ Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+ // Do the "real" scheduling.
+ while (!ReadyInsts.empty()) {
+ ScheduleData *picked = *ReadyInsts.begin();
+ ReadyInsts.erase(ReadyInsts.begin());
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ ScheduleData *BundleMember = picked;
+ while (BundleMember) {
+ Instruction *pickedInst = BundleMember->Inst;
+ if (LastScheduledInst->getNextNode() != pickedInst) {
+ BS->BB->getInstList().remove(pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
+ }
+ LastScheduledInst = pickedInst;
+ BundleMember = BundleMember->NextInBundle;
+ }
+
+ BS->schedule(picked, ReadyInsts);
+ NumToSchedule--;
+ }
+ assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Avoid duplicate scheduling of the block.
+ BS->ScheduleStart = nullptr;
+}
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, StoreList> StoreListMap;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ TargetLibraryInfo *TLI;
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ AssumptionCache *AC;
+
+ bool runOnFunction(Function &F) override {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ StoreRefs.clear();
+ bool Changed = false;
+
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(true))
+ return false;
+
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+
+ // Use the bottom up slp vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
+
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ // Vectorize trees that end at stores.
+ if (unsigned count = collectStores(BB, R)) {
+ (void)count;
+ DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ DEBUG(verifyFunction(F));
+ }
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+
+ /// \brief Collect memory references and sort them according to their base
+ /// object. We sort the stores to their base objects to reduce the cost of the
+ /// quadratic search on the stores. TODO: We can further reduce this cost
+ /// if we flush the chain creation every time we run into a memory barrier.
+ unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+
+ /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
+ bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+
+ /// \brief Try to vectorize a list of operands.
+ /// \param BuildVector A list of users to ignore for the purpose of
+ /// scheduling and that don't need extracting.
+ /// \returns true if a value was vectorized.
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector = None,
+ bool allowReorder = false);
+
+ /// \brief Try to vectorize a chain that may start at the operands of \p V.
+ bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+
+ /// \brief Vectorize the stores that were collected in StoreRefs.
+ bool vectorizeStoreChains(BoUpSLP &R);
+
+ /// \brief Scan the basic block and look for patterns that are likely to start
+ /// a vectorization chain.
+ bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+
+ bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+ BoUpSLP &R, unsigned VecRegSize);
+
+ bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+ BoUpSLP &R);
+private:
+ StoreListMap StoreRefs;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+};
+
+/// \brief Check that the Values in the slice of the VL array are still present
+/// in the WeakVH array.
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
+ unsigned SliceBegin, unsigned SliceSize) {
+ VL = VL.slice(SliceBegin, SliceSize);
+ VH = VH.slice(SliceBegin, SliceSize);
+ return !std::equal(VL.begin(), VL.end(), VH.begin());
+}
+
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
+ unsigned ChainLen = Chain.size();
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+ auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned VF = VecRegSize / Sz;
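+ // E.g. a 128-bit vector register and 32-bit stored elements give VF = 4.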
+
+ if (!isPowerOf2_32(Sz) || VF < 2)
+ return false;
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
+ bool Changed = false;
+ // Look for profitable vectorizable trees at all offsets, starting at zero.
+ for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+ if (i + VF > e)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+ R.buildTree(Operands);
+
+ int Cost = R.getTreeCost();
+
+ DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ if (Cost < CostThreshold) {
+ DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ int costThreshold, BoUpSLP &R) {
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search the Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because pairing with an immediately succeeding or preceding
+ // candidate usually creates the best chance to find an SLP vectorization
+ // opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
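+ // E.g. for i == 2 and e == 5 the visit order is 3, 4, 1, 0.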
+
+ for (auto &k : IndexQueue) {
+ if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ Tails.insert(Stores[k]);
+ Heads.insert(Stores[i]);
+ ConsecutiveChain[Stores[i]] = Stores[k];
+ break;
+ }
+ }
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ StoreInst *I = *it;
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (VectorizedStores.count(I))
+ break;
+ Operands.push_back(I);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+ unsigned count = 0;
+ StoreRefs.clear();
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+
+ // Don't touch volatile stores.
+ if (!SI->isSimple())
+ continue;
+
+ // Check that the pointer points to scalars.
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!isValidElementType(Ty))
+ continue;
+
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
+
+ // Save the store locations.
+ StoreRefs[Ptr].push_back(SI);
+ count++;
+ }
+ return count;
+}
+
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B)
+ return false;
+ Value *VL[] = { A, B };
+ return tryToVectorizeList(VL, R, None, true);
+}
+
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector,
+ bool allowReorder) {
+ if (VL.size() < 2)
+ return false;
+
+ DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+
+ // Check that all of the parts are scalar instructions of the same type.
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+
+ unsigned Opcode0 = I0->getOpcode();
+ const DataLayout &DL = I0->getModule()->getDataLayout();
+
+ Type *Ty0 = I0->getType();
+ unsigned Sz = DL.getTypeSizeInBits(Ty0);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
+ unsigned VF = MinVecRegSize / Sz;
+
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty))
+ return false;
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || Inst->getOpcode() != Opcode0)
+ return false;
+ }
+
+ bool Changed = false;
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
+
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ unsigned OpsWidth = 0;
+
+ if (i + VF > e)
+ OpsWidth = e - i;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+ ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+
+ ArrayRef<Value *> BuildVectorSlice;
+ if (!BuildVector.empty())
+ BuildVectorSlice = BuildVector.slice(i, OpsWidth);
+
+ R.buildTree(Ops, BuildVectorSlice);
+ // TODO: check if we can allow reordering also for other cases than
+ // tryToVectorizePair()
+ if (allowReorder && R.shouldReorder()) {
+ assert(Ops.size() == 2);
+ assert(BuildVectorSlice.empty());
+ Value *ReorderedOps[] = { Ops[1], Ops[0] };
+ R.buildTree(ReorderedOps, None);
+ }
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ Value *VectorizedRoot = R.vectorizeTree();
+
+ // Reconstruct the build vector by extracting the vectorized root. This
+ // way we handle the case where some elements of the vector are undefined.
+ // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
+ if (!BuildVectorSlice.empty()) {
+ // The insert point is the last build vector instruction. The vectorized
+ // root will precede it. This guarantees that we get an instruction. The
+ // vectorized tree could have been constant folded.
+ Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+ unsigned VecIdx = 0;
+ for (auto &V : BuildVectorSlice) {
+ IRBuilder<true, NoFolder> Builder(
+ InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
+ InsertElementInst *IE = cast<InsertElementInst>(V);
+ Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
+ VectorizedRoot, Builder.getInt32(VecIdx++)));
+ IE->setOperand(1, Extract);
+ IE->removeFromParent();
+ IE->insertAfter(Extract);
+ InsertAfter = IE;
+ }
+ }
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+ if (!V)
+ return false;
+
+ // Try to vectorize V.
+ if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ return true;
+
+ BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+ BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (tryToVectorizePair(A, B0, R)) {
+ return true;
+ }
+ if (tryToVectorizePair(A, B1, R)) {
+ return true;
+ }
+ }
+
+ // Try to skip A.
+ if (A && A->hasOneUse()) {
+ BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (tryToVectorizePair(A0, B, R)) {
+ return true;
+ }
+ if (tryToVectorizePair(A1, B, R)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+/// vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+/// reduction. A pairwise reduction will generate a mask of
+/// <0,2,...> or <1,3,..> while a splitting reduction will generate
+/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft,
+ IRBuilder<> &Builder) {
+ assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+ SmallVector<Constant *, 32> ShuffleMask(
+ VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+ if (IsPairwise)
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ else
+ // Move the upper half of the vector to the lower half.
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+ return ConstantVector::get(ShuffleMask);
+}
+
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) that has operations that can be put into a vector as its leaves.
+/// For example, this tree:
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ SmallVector<Value *, 16> ReductionOps;
+ SmallVector<Value *, 32> ReducedVals;
+
+ BinaryOperator *ReductionRoot;
+ PHINode *ReductionPHI;
+
+ /// The opcode of the reduction.
+ unsigned ReductionOpcode;
+ /// The opcode of the values we perform a reduction on.
+ unsigned ReducedValueOpcode;
+ /// Should we model this reduction as a pairwise reduction tree or a tree that
+ /// splits the vector in halves and adds those halves.
+ bool IsPairwiseReduction;
+
+public:
+ /// The width of one full horizontal reduction operation.
+ unsigned ReduxWidth;
+
+ HorizontalReduction()
+ : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
+ ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
+
+ /// \brief Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+ assert((!Phi ||
+ std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+ "Thi phi needs to use the binary operator");
+
+ // We could have an initial reduction that is not an add.
+ // r *= v1 + v2 + v3 + v4
+ // In such a case start looking for a tree rooted in the first '+'.
+ if (Phi) {
+ if (B->getOperand(0) == Phi) {
+ Phi = nullptr;
+ B = dyn_cast<BinaryOperator>(B->getOperand(1));
+ } else if (B->getOperand(1) == Phi) {
+ Phi = nullptr;
+ B = dyn_cast<BinaryOperator>(B->getOperand(0));
+ }
+ }
+
+ if (!B)
+ return false;
+
+ Type *Ty = B->getType();
+ if (!isValidElementType(Ty))
+ return false;
+
+ const DataLayout &DL = B->getModule()->getDataLayout();
+ ReductionOpcode = B->getOpcode();
+ ReducedValueOpcode = 0;
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
+ ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
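+ // E.g. with a 128-bit minimum vector register size and 32-bit elements,
+ // ReduxWidth is 4.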
+ ReductionRoot = B;
+ ReductionPHI = Phi;
+
+ if (ReduxWidth < 4)
+ return false;
+
+ // We currently only support adds.
+ if (ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd)
+ return false;
+
+ // Post order traverse the reduction tree starting at B. We only handle true
+ // trees containing only binary operators or selects.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
+ Stack.push_back(std::make_pair(B, 0));
+ while (!Stack.empty()) {
+ Instruction *TreeN = Stack.back().first;
+ unsigned EdgeToVist = Stack.back().second++;
+ bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+ // Only handle trees in the current basic block.
+ if (TreeN->getParent() != B->getParent())
+ return false;
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!TreeN->hasOneUse() && TreeN != B)
+ return false;
+
+ // Postorder visit.
+ if (EdgeToVist == 2 || IsReducedValue) {
+ if (IsReducedValue) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ if (!ReducedValueOpcode)
+ ReducedValueOpcode = TreeN->getOpcode();
+ else if (ReducedValueOpcode != TreeN->getOpcode())
+ return false;
+ ReducedVals.push_back(TreeN);
+ } else {
+ // We need to be able to reassociate the adds.
+ if (!TreeN->isAssociative())
+ return false;
+ ReductionOps.push_back(TreeN);
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
+ Value *NextV = TreeN->getOperand(EdgeToVist);
+ // We currently only allow BinaryOperators and SelectInsts as reduction
+ // values in our tree.
+ if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
+ Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
+ else if (NextV != Phi)
+ return false;
+ }
+ return true;
+ }
+
+ /// \brief Attempt to vectorize the tree found by
+ /// matchAssociativeReduction.
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ if (ReducedVals.empty())
+ return false;
+
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < ReduxWidth)
+ return false;
+
+ Value *VectorizedTree = nullptr;
+ IRBuilder<> Builder(ReductionRoot);
+ FastMathFlags Unsafe;
+ Unsafe.setUnsafeAlgebra();
+ Builder.SetFastMathFlags(Unsafe);
+ unsigned i = 0;
+
+ for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+
+ // Estimate cost.
+ int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ if (Cost >= -SLPCostThreshold)
+ break;
+
+ DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+ << ". (HorRdx)\n");
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree();
+
+ // Emit a reduction.
+ Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ if (VectorizedTree) {
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ } else
+ VectorizedTree = ReducedSubTree;
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
+ for (; i < NumReducedVals; ++i) {
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReducedVals[i])->getDebugLoc());
+ VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+ ReducedVals[i]);
+ }
+ // Update users.
+ if (ReductionPHI) {
+ assert(ReductionRoot && "Need a reduction operation");
+ ReductionRoot->setOperand(0, VectorizedTree);
+ ReductionRoot->setOperand(1, ReductionPHI);
+ } else
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ }
+ return VectorizedTree != nullptr;
+ }
+
+ unsigned numReductionValues() const {
+ return ReducedVals.size();
+ }
+
+private:
+ /// \brief Calculate the cost of a reduction.
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ Type *ScalarTy = FirstReducedVal->getType();
+ Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+ int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+ int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+ IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+ int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+ int ScalarReduxCost =
+ ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+ DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
+
+ return VecReduxCost - ScalarReduxCost;
+ }
+
+ static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+ Value *R, const Twine &Name = "") {
+ if (Opcode == Instruction::FAdd)
+ return Builder.CreateFAdd(L, R, Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+ }
+
+ /// \brief Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
+ Value *TmpVec = VectorizedValue;
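+ // Log2(ReduxWidth) shuffle+op steps. E.g. a splitting reduction of width 4:
+ //   step 1: shuffle <2,3,undef,undef>, op   -> lanes 0,1 hold partial results
+ //   step 2: shuffle <1,undef,undef,undef>, op -> lane 0 holds the result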
+ for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+ if (IsPairwiseReduction) {
+ Value *LeftMask =
+ createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+ Value *RightMask =
+ createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+ Value *LeftShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+ Value *RightShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+ "rdx.shuf.r");
+ TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+ "bin.rdx");
+ } else {
+ Value *UpperHalf =
+ createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+ TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+ }
+ }
+
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+ }
+};
+
+/// \brief Recognize construction of vectors like
+/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches
+///
+static bool findBuildVector(InsertElementInst *FirstInsertElem,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
+ return false;
+
+ InsertElementInst *IE = FirstInsertElem;
+ while (true) {
+ BuildVector.push_back(IE);
+ BuildVectorOpds.push_back(IE->getOperand(1));
+
+ if (IE->use_empty())
+ return false;
+
+ InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
+ if (!NextUse)
+ return true;
+
+ // If this isn't the final use, make sure the next insertelement is the only
+ // use. It's OK if the final constructed vector is used multiple times.
+ if (!IE->hasOneUse())
+ return false;
+
+ IE = NextUse;
+ }
+
+ return false;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
+/// \brief Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+  auto DominatedReduxValue = [&](Value *R) {
+    auto *I = dyn_cast<Instruction>(R);
+    return I && DT->dominates(P->getParent(), I->getParent());
+  };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch, return the incoming value if it comes from
+  // that. This reduction pattern occasionally turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
+/// \brief Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding
+/// the phi node P with reduction operators BI, then check if it
+/// can be done.
+/// \returns true if a horizontal reduction was matched and reduced.
+/// \returns false if a horizontal reduction was not matched.
+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
+ BoUpSLP &R, TargetTransformInfo *TTI) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ HorizontalReduction HorRdx;
+ if (!HorRdx.matchAssociativeReduction(P, BI))
+ return false;
+
+  // If there is a sufficient number of reduction values, reduce
+  // to a nearby power-of-2. We can safely generate oversized
+  // vectors and rely on the backend to split them to legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
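+  // For instance, nine reduction values give ReduxWidth 8, while three
+  // values are rounded up to the minimum width of 4.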
+
+ return HorRdx.tryToReduce(R, TTI);
+}
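+
+// A hypothetical example of the kind of source that yields the reduction PHI
+// and associative binary-operator chain matched above (a partially unrolled
+// sum loop):
+//
+//   float S = 0.0f;
+//   for (int i = 0; i < N; i += 4)
+//     S += A[i] + A[i + 1] + A[i + 2] + A[i + 3];
+//   return S;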
+
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ SmallVector<Value *, 4> Incoming;
+ SmallSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+ ++instr) {
+ PHINode *P = dyn_cast<PHINode>(instr);
+ if (!P)
+ break;
+
+ if (!VisitedInstrs.count(P))
+ Incoming.push_back(P);
+ }
+
+ // Sort by type.
+ std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+    // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+      DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+                   << NumElts << ")\n");
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+        // Success; start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ }
+
+ VisitedInstrs.clear();
+
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+ // We may go through BB multiple times so skip the one we have checked.
+ if (!VisitedInstrs.insert(&*it).second)
+ continue;
+
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ // Try to vectorize reductions that use PHINodes.
+ if (PHINode *P = dyn_cast<PHINode>(it)) {
+ // Check that the PHI is a reduction PHI.
+ if (P->getNumIncomingValues() != 2)
+ return Changed;
+
+ Value *Rdx = getReductionValue(DT, P, BB, LI);
+
+ // Check if this is a Binary Operator.
+ BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+ if (!BI)
+ continue;
+
+ // Try to match and vectorize a horizontal reduction.
+ if (canMatchHorizontalReduction(P, BI, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ Value *Inst = BI->getOperand(0);
+ if (Inst == P)
+ Inst = BI->getOperand(1);
+
+ if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ continue;
+ }
+
+ if (ShouldStartVectorizeHorAtStore)
+ if (StoreInst *SI = dyn_cast<StoreInst>(it))
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
+ tryToVectorize(BinOp, R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
+ // Try to vectorize horizontal reductions feeding into a return.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
+ if (RI->getNumOperands() != 0)
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(RI->getOperand(0))) {
+ DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
+ if (tryToVectorizePair(BinOp->getOperand(0),
+ BinOp->getOperand(1), R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
+ // Try to vectorize trees that start at compare instructions.
+ if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+            // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ break;
+ }
+ }
+ }
+ continue;
+ }
+
+ // Try to vectorize trees that start at insertelement instructions.
+ if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
+ continue;
+
+ // Vectorize starting with the build vector operands ignoring the
+ // BuildVector instructions for the purpose of scheduling and user
+ // extraction.
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+ bool Changed = false;
+ // Attempt to sort and vectorize each of the store-groups.
+ for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+ it != e; ++it) {
+ if (it->second.size() < 2)
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
+
+ // Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
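+    // For example, a chain of 37 stores is tried as slices of length 16, 16
+    // and 5.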
+ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+ unsigned Len = std::min<unsigned>(CE - CI, 16);
+ Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
+ -SLPCostThreshold, R);
+ }
+ }
+ return Changed;
+}
+
+} // end anonymous namespace
+
+char SLPVectorizer::ID = 0;
+static const char lv_name[] = "SLP Vectorizer";
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+namespace llvm {
+Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
new file mode 100644
index 0000000..6e002fd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -0,0 +1,48 @@
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+
+using namespace llvm;
+
+/// initializeVectorization - Initialize all passes linked into the
+/// Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+ initializeBBVectorizePass(Registry);
+ initializeLoopVectorizePass(Registry);
+ initializeSLPVectorizerPass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+ initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBBVectorizePass());
+}
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopVectorizePass());
+}
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSLPVectorizerPass());
+}
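+
+// A minimal sketch of driving these C bindings from client code (assumes a
+// populated LLVMModuleRef M already exists; error handling omitted):
+//
+//   #include "llvm-c/Core.h"
+//   #include "llvm-c/Transforms/Vectorize.h"
+//
+//   void runVectorizers(LLVMModuleRef M) {
+//     LLVMPassManagerRef PM = LLVMCreatePassManager();
+//     LLVMAddBBVectorizePass(PM);
+//     LLVMAddLoopVectorizePass(PM);
+//     LLVMAddSLPVectorizePass(PM);
+//     LLVMRunPassManager(PM, M);
+//     LLVMDisposePassManager(PM);
+//   }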