summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp')
-rw-r--r--contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp1012
1 files changed, 794 insertions, 218 deletions
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index 62d23cb..f7be3e3 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -28,12 +28,14 @@
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -41,17 +43,27 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <map>
using namespace llvm;
+static cl::opt<bool>
+IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
+ cl::Hidden, cl::desc("Ignore target information"));
+
static cl::opt<unsigned>
ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
cl::desc("The required chain depth for vectorization"));
+static cl::opt<bool>
+UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
+ cl::Hidden, cl::desc("Use the chain depth requirement with"
+ " target information"));
+
static cl::opt<unsigned>
SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
cl::desc("The maximum search distance for instruction pairs"));
@@ -93,8 +105,9 @@ static cl::opt<bool>
NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
cl::desc("Don't try to vectorize floating-point values"));
+// FIXME: This should default to false once pointer vector support works.
static cl::opt<bool>
-NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden,
+NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
cl::desc("Don't try to vectorize pointer values"));
static cl::opt<bool>
@@ -159,6 +172,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check",
cl::init(false), cl::Hidden,
cl::desc("When debugging is enabled, output information on the"
" cycle-checking process"));
+
+static cl::opt<bool>
+PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, dump the basic block after"
+ " every pair is fused"));
#endif
STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
@@ -177,13 +196,19 @@ namespace {
BBVectorize(Pass *P, const VectorizeConfig &C)
: BasicBlockPass(ID), Config(C) {
AA = &P->getAnalysis<AliasAnalysis>();
+ DT = &P->getAnalysis<DominatorTree>();
SE = &P->getAnalysis<ScalarEvolution>();
- TD = P->getAnalysisIfAvailable<TargetData>();
+ TD = P->getAnalysisIfAvailable<DataLayout>();
+ TTI = IgnoreTargetInfo ? 0 :
+ P->getAnalysisIfAvailable<TargetTransformInfo>();
+ VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
}
typedef std::pair<Value *, Value *> ValuePair;
+ typedef std::pair<ValuePair, int> ValuePairWithCost;
typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+ typedef std::pair<VPPair, unsigned> VPPairWithType;
typedef std::pair<std::multimap<Value *, Value *>::iterator,
std::multimap<Value *, Value *>::iterator> VPIteratorPair;
typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
@@ -191,8 +216,11 @@ namespace {
VPPIteratorPair;
AliasAnalysis *AA;
+ DominatorTree *DT;
ScalarEvolution *SE;
- TargetData *TD;
+ DataLayout *TD;
+ TargetTransformInfo *TTI;
+ const VectorTargetTransformInfo *VTTI;
// FIXME: const correct?
@@ -201,11 +229,23 @@ namespace {
bool getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len);
+ // FIXME: The current implementation does not account for pairs that
+ // are connected in multiple ways. For example:
+ // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
+ enum PairConnectionType {
+ PairConnectionDirect,
+ PairConnectionSwap,
+ PairConnectionSplat
+ };
+
void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes);
void buildDepMap(BasicBlock &BB,
std::multimap<Value *, Value *> &CandidatePairs,
@@ -213,19 +253,29 @@ namespace {
DenseSet<ValuePair> &PairableInstUsers);
void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
DenseMap<Value *, Value *>& ChosenPairs);
void fuseChosenPairs(BasicBlock &BB,
std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *>& ChosenPairs);
+ DenseMap<Value *, Value *>& ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps);
+
bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
bool areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len);
+ bool IsSimpleLoadStore, bool NonPow2Len,
+ int &CostSavings, int &FixedOrder);
bool trackUsesOfI(DenseSet<Value *> &Users,
AliasSetTracker &WriteSet, Instruction *I,
@@ -236,6 +286,7 @@ namespace {
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
ValuePair P);
bool pairsConflict(ValuePair P, ValuePair Q,
@@ -267,17 +318,21 @@ namespace {
void findBestTreeFor(
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
DenseMap<Value *, Value *> &ChosenPairs,
DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
- size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ int &BestEffSize, VPIteratorPair ChoiceRange,
bool UseCycleCheck);
Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool FlipMemInputs);
+ Instruction *J, unsigned o);
void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
unsigned MaskOffset, unsigned NumInElem,
@@ -289,20 +344,20 @@ namespace {
bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
unsigned o, Value *&LOp, unsigned numElemL,
- Type *ArgTypeL, Type *ArgTypeR,
+ Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
unsigned IdxOff = 0);
Value *getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool FlipMemInputs);
+ Instruction *J, unsigned o, bool IBeforeJ);
void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
- bool FlipMemInputs);
+ bool IBeforeJ);
void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
Instruction *J, Instruction *K,
Instruction *&InsertionPt, Instruction *&K1,
- Instruction *&K2, bool FlipMemInputs);
+ Instruction *&K2);
void collectPairLoadMoveSet(BasicBlock &BB,
DenseMap<Value *, Value *> &ChosenPairs,
@@ -314,10 +369,6 @@ namespace {
DenseMap<Value *, Value *> &ChosenPairs,
std::multimap<Value *, Value *> &LoadMoveSet);
- void collectPtrInfo(std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<Value *> &LowPtrInsts);
-
bool canMoveUsesOfIAfterJ(BasicBlock &BB,
std::multimap<Value *, Value *> &LoadMoveSet,
Instruction *I, Instruction *J);
@@ -330,13 +381,22 @@ namespace {
void combineMetadata(Instruction *K, const Instruction *J);
bool vectorizeBB(BasicBlock &BB) {
+ if (!DT->isReachableFromEntry(&BB)) {
+ DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
+ " in " << BB.getParent()->getName() << "\n");
+ return false;
+ }
+
+ DEBUG(if (VTTI) dbgs() << "BBV: using target information\n");
+
bool changed = false;
// Iterate a sufficient number of times to merge types of size 1 bit,
// then 2 bits, then 4, etc. up to half of the target vector width of the
// target vector register.
unsigned n = 1;
for (unsigned v = 2;
- v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
+ (VTTI || v <= Config.VectorBits) &&
+ (!Config.MaxIter || n <= Config.MaxIter);
v *= 2, ++n) {
DEBUG(dbgs() << "BBV: fusing loop #" << n <<
" for " << BB.getName() << " in " <<
@@ -363,8 +423,12 @@ namespace {
virtual bool runOnBasicBlock(BasicBlock &BB) {
AA = &getAnalysis<AliasAnalysis>();
+ DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
+ TTI = IgnoreTargetInfo ? 0 :
+ getAnalysisIfAvailable<TargetTransformInfo>();
+ VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
return vectorizeBB(BB);
}
@@ -372,8 +436,10 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
BasicBlockPass::getAnalysisUsage(AU);
AU.addRequired<AliasAnalysis>();
+ AU.addRequired<DominatorTree>();
AU.addRequired<ScalarEvolution>();
AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<DominatorTree>();
AU.addPreserved<ScalarEvolution>();
AU.setPreservesCFG();
}
@@ -415,6 +481,14 @@ namespace {
T2 = cast<CastInst>(I)->getSrcTy();
else
T2 = T1;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ T2 = SI->getCondition()->getType();
+ } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ T2 = SI->getOperand(0)->getType();
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+ T2 = CI->getOperand(0)->getType();
+ }
}
// Returns the weight associated with the provided value. A chain of
@@ -446,6 +520,62 @@ namespace {
return 1;
}
+ // Returns the cost of the provided instruction using VTTI.
+ // This does not handle loads and stores.
+ unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+ switch (Opcode) {
+ default: break;
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because scalar GEPs are usually
+ // lowered to the intruction addressing mode. At the moment we don't
+ // generate vector GEPs.
+ return 0;
+ case Instruction::Br:
+ return VTTI->getCFInstrCost(Opcode);
+ case Instruction::PHI:
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return VTTI->getArithmeticInstrCost(Opcode, T1);
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return VTTI->getCmpSelInstrCost(Opcode, T1, T2);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ case Instruction::ShuffleVector:
+ return VTTI->getCastInstrCost(Opcode, T1, T2);
+ }
+
+ return 1;
+ }
+
// This determines the relative offset of two loads or stores, returning
// true if the offset could be determined to be some constant value.
// For example, if OffsetInElmts == 1, then J accesses the memory directly
@@ -453,20 +583,30 @@ namespace {
// directly after J.
bool getPairPtrInfo(Instruction *I, Instruction *J,
Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
- int64_t &OffsetInElmts) {
+ unsigned &IAddressSpace, unsigned &JAddressSpace,
+ int64_t &OffsetInElmts, bool ComputeOffset = true) {
OffsetInElmts = 0;
- if (isa<LoadInst>(I)) {
- IPtr = cast<LoadInst>(I)->getPointerOperand();
- JPtr = cast<LoadInst>(J)->getPointerOperand();
- IAlignment = cast<LoadInst>(I)->getAlignment();
- JAlignment = cast<LoadInst>(J)->getAlignment();
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LoadInst *LJ = cast<LoadInst>(J);
+ IPtr = LI->getPointerOperand();
+ JPtr = LJ->getPointerOperand();
+ IAlignment = LI->getAlignment();
+ JAlignment = LJ->getAlignment();
+ IAddressSpace = LI->getPointerAddressSpace();
+ JAddressSpace = LJ->getPointerAddressSpace();
} else {
- IPtr = cast<StoreInst>(I)->getPointerOperand();
- JPtr = cast<StoreInst>(J)->getPointerOperand();
- IAlignment = cast<StoreInst>(I)->getAlignment();
- JAlignment = cast<StoreInst>(J)->getAlignment();
+ StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
+ IPtr = SI->getPointerOperand();
+ JPtr = SJ->getPointerOperand();
+ IAlignment = SI->getAlignment();
+ JAlignment = SJ->getAlignment();
+ IAddressSpace = SI->getPointerAddressSpace();
+ JAddressSpace = SJ->getPointerAddressSpace();
}
+ if (!ComputeOffset)
+ return true;
+
const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
@@ -536,6 +676,19 @@ namespace {
return false;
}
+
+ bool isPureIEChain(InsertElementInst *IE) {
+ InsertElementInst *IENext = IE;
+ do {
+ if (!isa<UndefValue>(IENext->getOperand(0)) &&
+ !isa<InsertElementInst>(IENext->getOperand(0))) {
+ return false;
+ }
+ } while ((IENext =
+ dyn_cast<InsertElementInst>(IENext->getOperand(0))));
+
+ return true;
+ }
};
// This function implements one vectorization iteration on the provided
@@ -546,11 +699,18 @@ namespace {
std::vector<Value *> AllPairableInsts;
DenseMap<Value *, Value *> AllChosenPairs;
+ DenseSet<ValuePair> AllFixedOrderPairs;
+ DenseMap<VPPair, unsigned> AllPairConnectionTypes;
+ std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps;
do {
std::vector<Value *> PairableInsts;
std::multimap<Value *, Value *> CandidatePairs;
+ DenseSet<ValuePair> FixedOrderPairs;
+ DenseMap<ValuePair, int> CandidatePairCostSavings;
ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
+ FixedOrderPairs,
+ CandidatePairCostSavings,
PairableInsts, NonPow2Len);
if (PairableInsts.empty()) continue;
@@ -563,10 +723,18 @@ namespace {
// Note that it only matters that both members of the second pair use some
// element of the first pair (to allow for splatting).
- std::multimap<ValuePair, ValuePair> ConnectedPairs;
- computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+ std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps;
+ DenseMap<VPPair, unsigned> PairConnectionTypes;
+ computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairConnectionTypes);
if (ConnectedPairs.empty()) continue;
+ for (std::multimap<ValuePair, ValuePair>::iterator
+ I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+ I != IE; ++I) {
+ ConnectedPairDeps.insert(VPPair(I->second, I->first));
+ }
+
// Build the pairable-instruction dependency map
DenseSet<ValuePair> PairableInstUsers;
buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
@@ -578,13 +746,48 @@ namespace {
// variables.
DenseMap<Value *, Value *> ChosenPairs;
- choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+ choosePairs(CandidatePairs, CandidatePairCostSavings,
+ PairableInsts, FixedOrderPairs, PairConnectionTypes,
+ ConnectedPairs, ConnectedPairDeps,
PairableInstUsers, ChosenPairs);
if (ChosenPairs.empty()) continue;
AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
PairableInsts.end());
AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+
+ // Only for the chosen pairs, propagate information on fixed-order pairs,
+ // pair connections, and their types to the data structures used by the
+ // pair fusion procedures.
+ for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
+ IE = ChosenPairs.end(); I != IE; ++I) {
+ if (FixedOrderPairs.count(*I))
+ AllFixedOrderPairs.insert(*I);
+ else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
+ AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
+
+ for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
+ J != IE; ++J) {
+ DenseMap<VPPair, unsigned>::iterator K =
+ PairConnectionTypes.find(VPPair(*I, *J));
+ if (K != PairConnectionTypes.end()) {
+ AllPairConnectionTypes.insert(*K);
+ } else {
+ K = PairConnectionTypes.find(VPPair(*J, *I));
+ if (K != PairConnectionTypes.end())
+ AllPairConnectionTypes.insert(*K);
+ }
+ }
+ }
+
+ for (std::multimap<ValuePair, ValuePair>::iterator
+ I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+ I != IE; ++I) {
+ if (AllPairConnectionTypes.count(*I)) {
+ AllConnectedPairs.insert(*I);
+ AllConnectedPairDeps.insert(VPPair(I->second, I->first));
+ }
+ }
} while (ShouldContinue);
if (AllChosenPairs.empty()) return false;
@@ -597,11 +800,13 @@ namespace {
// replaced with a vector_extract on the result. Subsequent optimization
// passes should coalesce the build/extract combinations.
- fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+ fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
+ AllPairConnectionTypes,
+ AllConnectedPairs, AllConnectedPairDeps);
// It is important to cleanup here so that future iterations of this
// function have less work to do.
- (void) SimplifyInstructionsInBlock(&BB, TD);
+ (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
return true;
}
@@ -667,15 +872,22 @@ namespace {
!(VectorType::isValidElementType(T2) || T2->isVectorTy()))
return false;
- if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) {
+ if (T1->getScalarSizeInBits() == 1) {
if (!Config.VectorizeBools)
return false;
} else {
- if (!Config.VectorizeInts
- && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
+ if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
return false;
}
-
+
+ if (T2->getScalarSizeInBits() == 1) {
+ if (!Config.VectorizeBools)
+ return false;
+ } else {
+ if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
+ return false;
+ }
+
if (!Config.VectorizeFloats
&& (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
return false;
@@ -691,8 +903,8 @@ namespace {
T2->getScalarType()->isPointerTy()))
return false;
- if (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
- T2->getPrimitiveSizeInBits() >= Config.VectorBits)
+ if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+ T2->getPrimitiveSizeInBits() >= Config.VectorBits))
return false;
return true;
@@ -703,10 +915,14 @@ namespace {
// that I has already been determined to be vectorizable and that J is not
// in the use tree of I.
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len) {
+ bool IsSimpleLoadStore, bool NonPow2Len,
+ int &CostSavings, int &FixedOrder) {
DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
" <-> " << *J << "\n");
+ CostSavings = 0;
+ FixedOrder = 0;
+
// Loads and stores can be merged if they have different alignments,
// but are otherwise the same.
if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
@@ -719,38 +935,84 @@ namespace {
unsigned MaxTypeBits = std::max(
IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
- if (MaxTypeBits > Config.VectorBits)
+ if (!VTTI && MaxTypeBits > Config.VectorBits)
return false;
// FIXME: handle addsub-type operations!
if (IsSimpleLoadStore) {
Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts = 0;
if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ IAddressSpace, JAddressSpace,
OffsetInElmts) && abs64(OffsetInElmts) == 1) {
- if (Config.AlignedOnly) {
- Type *aTypeI = isa<StoreInst>(I) ?
- cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
- Type *aTypeJ = isa<StoreInst>(J) ?
- cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ FixedOrder = (int) OffsetInElmts;
+ unsigned BottomAlignment = IAlignment;
+ if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+ Type *aTypeI = isa<StoreInst>(I) ?
+ cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ Type *aTypeJ = isa<StoreInst>(J) ?
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
+ if (Config.AlignedOnly) {
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
- unsigned BottomAlignment = IAlignment;
- if (OffsetInElmts < 0) BottomAlignment = JAlignment;
-
- Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
return false;
}
+
+ if (VTTI) {
+ unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
+ IAlignment, IAddressSpace);
+ unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(),
+ JAlignment, JAddressSpace);
+ unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
+ BottomAlignment,
+ IAddressSpace);
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts = VTTI->getNumberOfParts(VType);
+ if (VParts > 1)
+ return false;
+ else if (!VParts && VCost == ICost + JCost)
+ return false;
+
+ CostSavings = ICost + JCost - VCost;
+ }
} else {
return false;
}
+ } else if (VTTI) {
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts1 = VTTI->getNumberOfParts(VT1),
+ VParts2 = VTTI->getNumberOfParts(VT2);
+ if (VParts1 > 1 || VParts2 > 1)
+ return false;
+ else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
+ return false;
+
+ CostSavings = ICost + JCost - VCost;
}
// The powi intrinsic is special because only the first argument is
@@ -833,6 +1095,8 @@ namespace {
bool BBVectorize::getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len) {
BasicBlock::iterator E = BB.end();
if (Start == E) return false;
@@ -869,7 +1133,9 @@ namespace {
// J does not use I, and comes before the first use of I, so it can be
// merged with I if the instructions are compatible.
- if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue;
+ int CostSavings, FixedOrder;
+ if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
+ CostSavings, FixedOrder)) continue;
// J is a candidate for merging with I.
if (!PairableInsts.size() ||
@@ -878,6 +1144,14 @@ namespace {
}
CandidatePairs.insert(ValuePair(I, J));
+ if (VTTI)
+ CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
+ CostSavings));
+
+ if (FixedOrder == 1)
+ FixedOrderPairs.insert(ValuePair(I, J));
+ else if (FixedOrder == -1)
+ FixedOrderPairs.insert(ValuePair(J, I));
// The next call to this function must start after the last instruction
// selected during this invocation.
@@ -887,7 +1161,8 @@ namespace {
}
DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
- << *I << " <-> " << *J << "\n");
+ << *I << " <-> " << *J << " (cost savings: " <<
+ CostSavings << ")\n");
// If we have already found too many pairs, break here and this function
// will be called again starting after the last instruction selected
@@ -915,6 +1190,7 @@ namespace {
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
ValuePair P) {
StoreInst *SI, *SJ;
@@ -946,12 +1222,18 @@ namespace {
VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
// Look for <I, J>:
- if (isSecondInIteratorPair<Value*>(*J, IPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ VPPair VP(P, ValuePair(*I, *J));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
+ }
// Look for <J, I>:
- if (isSecondInIteratorPair<Value*>(*I, JPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+ if (isSecondInIteratorPair<Value*>(*I, JPairRange)) {
+ VPPair VP(P, ValuePair(*J, *I));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
+ }
}
if (Config.SplatBreaksChain) continue;
@@ -962,8 +1244,11 @@ namespace {
P.first == SJ->getPointerOperand())
continue;
- if (isSecondInIteratorPair<Value*>(*J, IPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ VPPair VP(P, ValuePair(*I, *J));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+ }
}
}
@@ -985,8 +1270,11 @@ namespace {
P.second == SJ->getPointerOperand())
continue;
- if (isSecondInIteratorPair<Value*>(*J, IPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ VPPair VP(P, ValuePair(*I, *J));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+ }
}
}
}
@@ -997,7 +1285,8 @@ namespace {
void BBVectorize::computeConnectedPairs(
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes) {
for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
PE = PairableInsts.end(); PI != PE; ++PI) {
@@ -1006,7 +1295,7 @@ namespace {
for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
P != choiceRange.second; ++P)
computePairsConnectedTo(CandidatePairs, PairableInsts,
- ConnectedPairs, *P);
+ ConnectedPairs, PairConnectionTypes, *P);
}
DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
@@ -1196,7 +1485,7 @@ namespace {
PrunedTree.insert(QTop.first);
// Visit each child, pruning as necessary...
- DenseMap<ValuePair, size_t> BestChildren;
+ SmallVector<ValuePairWithDepth, 8> BestChildren;
VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
K != QTopRange.second; ++K) {
@@ -1228,7 +1517,7 @@ namespace {
DenseSet<ValuePair> CurrentPairs;
bool CanAdd = true;
- for (DenseMap<ValuePair, size_t>::iterator C2
+ for (SmallVector<ValuePairWithDepth, 8>::iterator C2
= BestChildren.begin(), E2 = BestChildren.end();
C2 != E2; ++C2) {
if (C2->first.first == C->first.first ||
@@ -1313,22 +1602,22 @@ namespace {
// to an already-selected child. Check for this here, and if a
// conflict is found, then remove the previously-selected child
// before adding this one in its place.
- for (DenseMap<ValuePair, size_t>::iterator C2
+ for (SmallVector<ValuePairWithDepth, 8>::iterator C2
= BestChildren.begin(); C2 != BestChildren.end();) {
if (C2->first.first == C->first.first ||
C2->first.first == C->first.second ||
C2->first.second == C->first.first ||
C2->first.second == C->first.second ||
pairsConflict(C2->first, C->first, PairableInstUsers))
- BestChildren.erase(C2++);
+ C2 = BestChildren.erase(C2);
else
++C2;
}
- BestChildren.insert(ValuePairWithDepth(C->first, C->second));
+ BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
}
- for (DenseMap<ValuePair, size_t>::iterator C
+ for (SmallVector<ValuePairWithDepth, 8>::iterator C
= BestChildren.begin(), E2 = BestChildren.end();
C != E2; ++C) {
size_t DepthF = getDepthFactor(C->first.first);
@@ -1341,13 +1630,17 @@ namespace {
// pairs, given the choice of root pairs as an iterator range.
void BBVectorize::findBestTreeFor(
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
DenseMap<Value *, Value *> &ChosenPairs,
DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
- size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ int &BestEffSize, VPIteratorPair ChoiceRange,
bool UseCycleCheck) {
for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
J != ChoiceRange.second; ++J) {
@@ -1397,17 +1690,289 @@ namespace {
PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
PrunedTree, *J, UseCycleCheck);
- size_t EffSize = 0;
- for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
- E = PrunedTree.end(); S != E; ++S)
- EffSize += getDepthFactor(S->first);
+ int EffSize = 0;
+ if (VTTI) {
+ DenseSet<Value *> PrunedTreeInstrs;
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S) {
+ PrunedTreeInstrs.insert(S->first);
+ PrunedTreeInstrs.insert(S->second);
+ }
+
+ // The set of pairs that have already contributed to the total cost.
+ DenseSet<ValuePair> IncomingPairs;
+
+ // If the cost model were perfect, this might not be necessary; but we
+ // need to make sure that we don't get stuck vectorizing our own
+ // shuffle chains.
+ bool HasNontrivialInsts = false;
+
+ // The node weights represent the cost savings associated with
+ // fusing the pair of instructions.
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S) {
+ if (!isa<ShuffleVectorInst>(S->first) &&
+ !isa<InsertElementInst>(S->first) &&
+ !isa<ExtractElementInst>(S->first))
+ HasNontrivialInsts = true;
+
+ bool FlipOrder = false;
+
+ if (getDepthFactor(S->first)) {
+ int ESContrib = CandidatePairCostSavings.find(*S)->second;
+ DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
+ << *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize += ESContrib;
+ }
+
+ // The edge weights contribute in a negative sense: they represent
+ // the cost of shuffles.
+ VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S);
+ if (IP.first != ConnectedPairDeps.end()) {
+ unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ if (!PrunedTree.count(Q->second))
+ continue;
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ ++NumDepsDirect;
+ else if (R->second == PairConnectionSwap)
+ ++NumDepsSwap;
+ }
+
+ // If there are more swaps than direct connections, then
+ // the pair order will be flipped during fusion. So the real
+ // number of swaps is the minimum number.
+ FlipOrder = !FixedOrderPairs.count(*S) &&
+ ((NumDepsSwap > NumDepsDirect) ||
+ FixedOrderPairs.count(ValuePair(S->second, S->first)));
+
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ if (!PrunedTree.count(Q->second))
+ continue;
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ Type *Ty1 = Q->second.first->getType(),
+ *Ty2 = Q->second.second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+ if ((R->second == PairConnectionDirect && FlipOrder) ||
+ (R->second == PairConnectionSwap && !FlipOrder) ||
+ R->second == PairConnectionSplat) {
+ int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *Q->second.first << " <-> " << *Q->second.second <<
+ "} -> {" <<
+ *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+ }
+
+ // Compute the cost of outgoing edges. We assume that edges outgoing
+ // to shuffles, inserts or extracts can be merged, and so contribute
+ // no additional cost.
+ if (!S->first->getType()->isVoidTy()) {
+ Type *Ty1 = S->first->getType(),
+ *Ty2 = S->second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ bool NeedsExtraction = false;
+ for (Value::use_iterator I = S->first->use_begin(),
+ IE = S->first->use_end(); I != IE; ++I) {
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(*I))
+ continue;
+ if (PrunedTreeInstrs.count(*I))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty1->isVectorTy())
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty1, VTy);
+ else
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 0);
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->first << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+
+ NeedsExtraction = false;
+ for (Value::use_iterator I = S->second->use_begin(),
+ IE = S->second->use_end(); I != IE; ++I) {
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(*I))
+ continue;
+ if (PrunedTreeInstrs.count(*I))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty2->isVectorTy())
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty2, VTy);
+ else
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 1);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->second << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+
+ // Compute the cost of incoming edges.
+ if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
+ Instruction *S1 = cast<Instruction>(S->first),
+ *S2 = cast<Instruction>(S->second);
+ for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
+ Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
+
+ // Combining constants into vector constants (or small vector
+ // constants into larger ones are assumed free).
+ if (isa<Constant>(O1) && isa<Constant>(O2))
+ continue;
+
+ if (FlipOrder)
+ std::swap(O1, O2);
+
+ ValuePair VP = ValuePair(O1, O2);
+ ValuePair VPR = ValuePair(O2, O1);
+
+ // Internal edges are not handled here.
+ if (PrunedTree.count(VP) || PrunedTree.count(VPR))
+ continue;
+
+ Type *Ty1 = O1->getType(),
+ *Ty2 = O2->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ // Combining vector operations of the same type is also assumed
+ // folded with other operations.
+ if (Ty1 == Ty2) {
+ // If both are insert elements, then both can be widened.
+ InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
+ *IEO2 = dyn_cast<InsertElementInst>(O2);
+ if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
+ continue;
+ // If both are extract elements, and both have the same input
+ // type, then they can be replaced with a shuffle
+ ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
+ *EIO2 = dyn_cast<ExtractElementInst>(O2);
+ if (EIO1 && EIO2 &&
+ EIO1->getOperand(0)->getType() ==
+ EIO2->getOperand(0)->getType())
+ continue;
+ // If both are a shuffle with equal operand types and only two
+ // unqiue operands, then they can be replaced with a single
+ // shuffle
+ ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
+ *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
+ if (SIO1 && SIO2 &&
+ SIO1->getOperand(0)->getType() ==
+ SIO2->getOperand(0)->getType()) {
+ SmallSet<Value *, 4> SIOps;
+ SIOps.insert(SIO1->getOperand(0));
+ SIOps.insert(SIO1->getOperand(1));
+ SIOps.insert(SIO2->getOperand(0));
+ SIOps.insert(SIO2->getOperand(1));
+ if (SIOps.size() <= 2)
+ continue;
+ }
+ }
+
+ int ESContrib;
+ // This pair has already been formed.
+ if (IncomingPairs.count(VP)) {
+ continue;
+ } else if (IncomingPairs.count(VPR)) {
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+ } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 0);
+ ESContrib += (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 1);
+ } else if (!Ty1->isVectorTy()) {
+ // O1 needs to be inserted into a vector of size O2, and then
+ // both need to be shuffled together.
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty2, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty2);
+ } else if (!Ty2->isVectorTy()) {
+ // O2 needs to be inserted into a vector of size O1, and then
+ // both need to be shuffled together.
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty1, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty1);
+ } else {
+ Type *TyBig = Ty1, *TySmall = Ty2;
+ if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
+ std::swap(TyBig, TySmall);
+
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, TyBig);
+ if (TyBig != TySmall)
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ TyBig, TySmall);
+ }
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
+ << *O1 << " <-> " << *O2 << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ IncomingPairs.insert(VP);
+ }
+ }
+ }
+
+ if (!HasNontrivialInsts) {
+ DEBUG(if (DebugPairSelection) dbgs() <<
+ "\tNo non-trivial instructions in tree;"
+ " override to zero effective size\n");
+ EffSize = 0;
+ }
+ } else {
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S)
+ EffSize += (int) getDepthFactor(S->first);
+ }
DEBUG(if (DebugPairSelection)
dbgs() << "BBV: found pruned Tree for pair {"
<< *J->first << " <-> " << *J->second << "} of depth " <<
MaxDepth << " and size " << PrunedTree.size() <<
" (effective size: " << EffSize << ")\n");
- if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) {
+ if (((VTTI && !UseChainDepthWithTI) ||
+ MaxDepth >= Config.ReqChainDepth) &&
+ EffSize > 0 && EffSize > BestEffSize) {
BestMaxDepth = MaxDepth;
BestEffSize = EffSize;
BestTree = PrunedTree;
@@ -1419,8 +1984,12 @@ namespace {
// that will be fused into vector instructions.
void BBVectorize::choosePairs(
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
DenseMap<Value *, Value *>& ChosenPairs) {
bool UseCycleCheck =
@@ -1435,9 +2004,12 @@ namespace {
VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
// The best pair to choose and its tree:
- size_t BestMaxDepth = 0, BestEffSize = 0;
+ size_t BestMaxDepth = 0;
+ int BestEffSize = 0;
DenseSet<ValuePair> BestTree;
- findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ findBestTreeFor(CandidatePairs, CandidatePairCostSavings,
+ PairableInsts, FixedOrderPairs, PairConnectionTypes,
+ ConnectedPairs, ConnectedPairDeps,
PairableInstUsers, PairableInstUserMap, ChosenPairs,
BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
UseCycleCheck);
@@ -1490,24 +2062,19 @@ namespace {
// Returns the value that is to be used as the pointer input to the vector
// instruction that fuses I with J.
Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
- Instruction *I, Instruction *J, unsigned o,
- bool FlipMemInputs) {
+ Instruction *I, Instruction *J, unsigned o) {
Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts;
- // Note: the analysis might fail here, that is why FlipMemInputs has
+ // Note: the analysis might fail here, that is why the pair order has
// been precomputed (OffsetInElmts must be unused here).
(void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- OffsetInElmts);
+ IAddressSpace, JAddressSpace,
+ OffsetInElmts, false);
// The pointer value is taken to be the one with the lowest offset.
- Value *VPtr;
- if (!FlipMemInputs) {
- VPtr = IPtr;
- } else {
- VPtr = JPtr;
- }
+ Value *VPtr = IPtr;
Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
@@ -1515,7 +2082,7 @@ namespace {
Type *VArgPtrType = PointerType::get(VArgType,
cast<PointerType>(IPtr->getType())->getAddressSpace());
return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
- /* insert before */ FlipMemInputs ? J : I);
+ /* insert before */ I);
}
void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
@@ -1585,23 +2152,12 @@ namespace {
Instruction *J, unsigned o, Value *&LOp,
unsigned numElemL,
Type *ArgTypeL, Type *ArgTypeH,
- unsigned IdxOff) {
+ bool IBeforeJ, unsigned IdxOff) {
bool ExpandedIEChain = false;
if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
// If we have a pure insertelement chain, then this can be rewritten
// into a chain that directly builds the larger type.
- bool PureChain = true;
- InsertElementInst *LIENext = LIE;
- do {
- if (!isa<UndefValue>(LIENext->getOperand(0)) &&
- !isa<InsertElementInst>(LIENext->getOperand(0))) {
- PureChain = false;
- break;
- }
- } while ((LIENext =
- dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
-
- if (PureChain) {
+ if (isPureIEChain(LIE)) {
SmallVector<Value *, 8> VectElemts(numElemL,
UndefValue::get(ArgTypeL->getScalarType()));
InsertElementInst *LIENext = LIE;
@@ -1619,8 +2175,9 @@ namespace {
LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
ConstantInt::get(Type::getInt32Ty(Context),
i + IdxOff),
- getReplacementName(I, true, o, i+1));
- LIENext->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, i+1));
+ LIENext->insertBefore(IBeforeJ ? J : I);
LIEPrev = LIENext;
}
@@ -1635,7 +2192,7 @@ namespace {
// Returns the value to be used as the specified operand of the vector
// instruction that fuses I with J.
Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool FlipMemInputs) {
+ Instruction *J, unsigned o, bool IBeforeJ) {
Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
@@ -1646,12 +2203,6 @@ namespace {
Instruction *L = I, *H = J;
Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
- if (FlipMemInputs) {
- L = J;
- H = I;
- ArgTypeL = ArgTypeJ;
- ArgTypeH = ArgTypeI;
- }
unsigned numElemL;
if (ArgTypeL->isVectorTy())
@@ -1804,8 +2355,9 @@ namespace {
Instruction *S =
new ShuffleVectorInst(I1, UndefValue::get(I1T),
ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- S->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o));
+ S->insertBefore(IBeforeJ ? J : I);
return S;
}
@@ -1826,8 +2378,9 @@ namespace {
Instruction *NewI1 =
new ShuffleVectorInst(I1, UndefValue::get(I1T),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
- NewI1->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ NewI1->insertBefore(IBeforeJ ? J : I);
I1 = NewI1;
I1T = I2T;
I1Elem = I2Elem;
@@ -1842,8 +2395,9 @@ namespace {
Instruction *NewI2 =
new ShuffleVectorInst(I2, UndefValue::get(I2T),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
- NewI2->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ NewI2->insertBefore(IBeforeJ ? J : I);
I2 = NewI2;
I2T = I1T;
I2Elem = I1Elem;
@@ -1863,8 +2417,8 @@ namespace {
Instruction *NewOp =
new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- NewOp->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ NewOp->insertBefore(IBeforeJ ? J : I);
return NewOp;
}
}
@@ -1872,17 +2426,17 @@ namespace {
Type *ArgType = ArgTypeL;
if (numElemL < numElemH) {
if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
- ArgTypeL, VArgType, 1)) {
+ ArgTypeL, VArgType, IBeforeJ, 1)) {
// This is another short-circuit case: we're combining a scalar into
// a vector that is formed by an IE chain. We've just expanded the IE
// chain, now insert the scalar and we're done.
Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
- getReplacementName(I, true, o));
- S->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ S->insertBefore(IBeforeJ ? J : I);
return S;
} else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
- ArgTypeH)) {
+ ArgTypeH, IBeforeJ)) {
// The two vector inputs to the shuffle must be the same length,
// so extend the smaller vector to be the same length as the larger one.
Instruction *NLOp;
@@ -1897,29 +2451,32 @@ namespace {
NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
} else {
NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
}
- NLOp->insertBefore(J);
+ NLOp->insertBefore(IBeforeJ ? J : I);
LOp = NLOp;
}
ArgType = ArgTypeH;
} else if (numElemL > numElemH) {
if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
- ArgTypeH, VArgType)) {
+ ArgTypeH, VArgType, IBeforeJ)) {
Instruction *S =
InsertElementInst::Create(LOp, HOp,
ConstantInt::get(Type::getInt32Ty(Context),
numElemL),
- getReplacementName(I, true, o));
- S->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o));
+ S->insertBefore(IBeforeJ ? J : I);
return S;
} else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
- ArgTypeL)) {
+ ArgTypeL, IBeforeJ)) {
Instruction *NHOp;
if (numElemH > 1) {
std::vector<Constant *> Mask(numElemL);
@@ -1931,13 +2488,15 @@ namespace {
NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
} else {
NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
}
- NHOp->insertBefore(J);
+ NHOp->insertBefore(IBeforeJ ? J : I);
HOp = NHOp;
}
}
@@ -1955,19 +2514,21 @@ namespace {
}
Instruction *BV = new ShuffleVectorInst(LOp, HOp,
- ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- BV->insertBefore(J);
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ BV->insertBefore(IBeforeJ ? J : I);
return BV;
}
Instruction *BV1 = InsertElementInst::Create(
UndefValue::get(VArgType), LOp, CV0,
- getReplacementName(I, true, o, 1));
- BV1->insertBefore(I);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ BV1->insertBefore(IBeforeJ ? J : I);
Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
- getReplacementName(I, true, o, 2));
- BV2->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 2));
+ BV2->insertBefore(IBeforeJ ? J : I);
return BV2;
}
@@ -1976,7 +2537,7 @@ namespace {
void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
Instruction *I, Instruction *J,
SmallVector<Value *, 3> &ReplacedOperands,
- bool FlipMemInputs) {
+ bool IBeforeJ) {
unsigned NumOperands = I->getNumOperands();
for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
@@ -1985,8 +2546,7 @@ namespace {
if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
// This is the pointer for a load/store instruction.
- ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o,
- FlipMemInputs);
+ ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
continue;
} else if (isa<CallInst>(I)) {
Function *F = cast<CallInst>(I)->getCalledFunction();
@@ -2014,8 +2574,7 @@ namespace {
continue;
}
- ReplacedOperands[o] =
- getReplacementInput(Context, I, J, o, FlipMemInputs);
+ ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
}
}
@@ -2026,8 +2585,7 @@ namespace {
void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
Instruction *J, Instruction *K,
Instruction *&InsertionPt,
- Instruction *&K1, Instruction *&K2,
- bool FlipMemInputs) {
+ Instruction *&K1, Instruction *&K2) {
if (isa<StoreInst>(I)) {
AA->replaceWithNewValue(I, K);
AA->replaceWithNewValue(J, K);
@@ -2057,13 +2615,11 @@ namespace {
}
K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(
- FlipMemInputs ? Mask2 : Mask1),
+ ConstantVector::get( Mask1),
getReplacementName(K, false, 1));
} else {
Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
- K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
+ K1 = ExtractElementInst::Create(K, CV0,
getReplacementName(K, false, 1));
}
@@ -2075,13 +2631,11 @@ namespace {
}
K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(
- FlipMemInputs ? Mask1 : Mask2),
+ ConstantVector::get( Mask2),
getReplacementName(K, false, 2));
} else {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
- K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
+ K2 = ExtractElementInst::Create(K, CV1,
getReplacementName(K, false, 2));
}
@@ -2181,36 +2735,6 @@ namespace {
}
}
- // As with the aliasing information, SCEV can also change because of
- // vectorization. This information is used to compute relative pointer
- // offsets; the necessary information will be cached here prior to
- // fusion.
- void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<Value *> &LowPtrInsts) {
- for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
- PIE = PairableInsts.end(); PI != PIE; ++PI) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
- if (P == ChosenPairs.end()) continue;
-
- Instruction *I = cast<Instruction>(P->first);
- Instruction *J = cast<Instruction>(P->second);
-
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
- continue;
-
- Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment;
- int64_t OffsetInElmts;
- if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- OffsetInElmts) || abs64(OffsetInElmts) != 1)
- llvm_unreachable("Pre-fusion pointer analysis failed");
-
- Value *LowPI = (OffsetInElmts > 0) ? I : J;
- LowPtrInsts.insert(LowPI);
- }
- }
-
// When the first instruction in each pair is cloned, it will inherit its
// parent's metadata. This metadata must be combined with that of the other
// instruction in a safe way.
@@ -2244,27 +2768,27 @@ namespace {
// second member).
void BBVectorize::fuseChosenPairs(BasicBlock &BB,
std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs) {
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) {
LLVMContext& Context = BB.getContext();
// During the vectorization process, the order of the pairs to be fused
// could be flipped. So we'll add each pair, flipped, into the ChosenPairs
// list. After a pair is fused, the flipped pair is removed from the list.
- std::vector<ValuePair> FlippedPairs;
- FlippedPairs.reserve(ChosenPairs.size());
+ DenseSet<ValuePair> FlippedPairs;
for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
E = ChosenPairs.end(); P != E; ++P)
- FlippedPairs.push_back(ValuePair(P->second, P->first));
- for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+ FlippedPairs.insert(ValuePair(P->second, P->first));
+ for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
E = FlippedPairs.end(); P != E; ++P)
ChosenPairs.insert(*P);
std::multimap<Value *, Value *> LoadMoveSet;
collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
- DenseSet<Value *> LowPtrInsts;
- collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
-
DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
@@ -2304,44 +2828,92 @@ namespace {
continue;
}
- bool FlipMemInputs = false;
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+ // If the pair must have the other order, then flip it.
+ bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
+ if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
+ // This pair does not have a fixed order, and so we might want to
+ // flip it if that will yield fewer shuffles. We count the number
+ // of dependencies connected via swaps, and those directly connected,
+ // and flip the order if the number of swaps is greater.
+ bool OrigOrder = true;
+ VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J));
+ if (IP.first == ConnectedPairDeps.end()) {
+ IP = ConnectedPairDeps.equal_range(ValuePair(J, I));
+ OrigOrder = false;
+ }
+ if (IP.first != ConnectedPairDeps.end()) {
+ unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ ++NumDepsDirect;
+ else if (R->second == PairConnectionSwap)
+ ++NumDepsSwap;
+ }
+
+ if (!OrigOrder)
+ std::swap(NumDepsDirect, NumDepsSwap);
+
+ if (NumDepsSwap > NumDepsDirect) {
+ FlipPairOrder = true;
+ DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
+ " <-> " << *J << "\n");
+ }
+ }
+ }
+
+ Instruction *L = I, *H = J;
+ if (FlipPairOrder)
+ std::swap(H, L);
+
+ // If the pair being fused uses the opposite order from that in the pair
+ // connection map, then we need to flip the types.
+ VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L));
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q);
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ R->second = PairConnectionSwap;
+ else if (R->second == PairConnectionSwap)
+ R->second = PairConnectionDirect;
+ }
+
+ bool LBeforeH = !FlipPairOrder;
unsigned NumOperands = I->getNumOperands();
SmallVector<Value *, 3> ReplacedOperands(NumOperands);
- getReplacementInputsForPair(Context, I, J, ReplacedOperands,
- FlipMemInputs);
+ getReplacementInputsForPair(Context, L, H, ReplacedOperands,
+ LBeforeH);
// Make a copy of the original operation, change its type to the vector
// type and replace its operands with the vector operands.
- Instruction *K = I->clone();
- if (I->hasName()) K->takeName(I);
+ Instruction *K = L->clone();
+ if (L->hasName())
+ K->takeName(L);
+ else if (H->hasName())
+ K->takeName(H);
if (!isa<StoreInst>(K))
- K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+ K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
- combineMetadata(K, J);
+ combineMetadata(K, H);
+ K->intersectOptionalDataWith(H);
for (unsigned o = 0; o < NumOperands; ++o)
K->setOperand(o, ReplacedOperands[o]);
- // If we've flipped the memory inputs, make sure that we take the correct
- // alignment.
- if (FlipMemInputs) {
- if (isa<StoreInst>(K))
- cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
- else
- cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
- }
-
K->insertAfter(J);
// Instruction insertion point:
Instruction *InsertionPt = K;
Instruction *K1 = 0, *K2 = 0;
- replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
- FlipMemInputs);
+ replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
// The use tree of the first original instruction must be moved to after
// the location of the second instruction. The entire use tree of the
@@ -2351,10 +2923,10 @@ namespace {
moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
if (!isa<StoreInst>(I)) {
- I->replaceAllUsesWith(K1);
- J->replaceAllUsesWith(K2);
- AA->replaceWithNewValue(I, K1);
- AA->replaceWithNewValue(J, K2);
+ L->replaceAllUsesWith(K1);
+ H->replaceAllUsesWith(K2);
+ AA->replaceWithNewValue(L, K1);
+ AA->replaceWithNewValue(H, K2);
}
// Instructions that may read from memory may be in the load move set.
@@ -2387,6 +2959,9 @@ namespace {
SE->forgetValue(J);
I->eraseFromParent();
J->eraseFromParent();
+
+ DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
+ BB << "\n");
}
DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
@@ -2397,6 +2972,7 @@ char BBVectorize::ID = 0;
static const char bb_vectorize_name[] = "Basic-Block Vectorization";
INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
OpenPOWER on IntegriCloud