summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Transforms/Scalar
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ADCE.cpp108
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp428
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/BDCE.cpp102
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp606
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp98
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp419
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/DCE.cpp156
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp961
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp890
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp80
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp543
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVN.cpp2931
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp2199
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp1503
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp1955
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LICM.cpp1107
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp282
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp254
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp836
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp1112
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp195
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp1307
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp566
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp1527
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp624
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp5024
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp1030
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp1342
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp148
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp192
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp1304
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp587
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp577
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp163
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp953
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp2306
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp132
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp2915
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SCCP.cpp1980
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SROA.cpp4291
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Scalar.cpp244
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp2630
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp678
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp1265
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp239
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Sink.cpp290
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp243
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp724
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp953
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp851
50 files changed, 51850 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 0000000..590a52d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,108 @@
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass. This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "adce"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+static bool aggressiveDCE(Function& F) { // Returns true iff an instruction was erased.
+ SmallPtrSet<Instruction*, 128> Alive;
+ SmallVector<Instruction*, 128> Worklist;
+
+ // Roots: terminators, debug intrinsics, EH pads, and anything with side effects.
+ for (Instruction &I : instructions(F)) {
+ if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() ||
+ I.mayHaveSideEffects()) {
+ Alive.insert(&I);
+ Worklist.push_back(&I);
+ }
+ }
+
+ // Propagate liveness backwards: instruction operands of a live instruction are live.
+ while (!Worklist.empty()) {
+ Instruction *Curr = Worklist.pop_back_val();
+ for (Use &OI : Curr->operands()) {
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ if (Alive.insert(Inst).second) // .second is true only on first insertion.
+ Worklist.push_back(Inst);
+ }
+ }
+
+ // The inverse of the live set is the dead set: instructions that have no
+ // side effects and do not influence the control flow or return value of the
+ // function. Their operands are dropped here, before any erasure below.
+ // NOTE: The (now empty) Worklist vector is reused to collect the dead set.
+ for (Instruction &I : instructions(F)) {
+ if (!Alive.count(&I)) {
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ }
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return !Worklist.empty(); // Non-empty iff at least one instruction was erased.
+}
+
+PreservedAnalyses ADCEPass::run(Function &F) { // Runs aggressiveDCE on F.
+ if (aggressiveDCE(F))
+ return PreservedAnalyses::none(); // Something was erased: preserve nothing.
+ return PreservedAnalyses::all(); // F is unchanged.
+}
+
+namespace {
+struct ADCELegacyPass : public FunctionPass { // FunctionPass wrapper around aggressiveDCE.
+ static char ID; // Pass identification, replacement for typeid
+ ADCELegacyPass() : FunctionPass(ID) {
+ initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function& F) override { // Returns true iff F was changed.
+ if (skipOptnoneFunction(F))
+ return false;
+ return aggressiveDCE(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage& AU) const override {
+ AU.setPreservesCFG(); // aggressiveDCE treats every terminator as live.
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char ADCELegacyPass::ID = 0; // Definition of the static ID declared in the struct.
+INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+ false, false)
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); } // Heap-allocates a new pass object.
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
new file mode 100644
index 0000000..4b721d3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -0,0 +1,428 @@
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+// Set Load/Store Alignments From Assumptions
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of load, stores and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
+
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+ "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+ "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+ "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass { // Raises load/store/mem-intrinsic alignments from assume calls.
+ static char ID; // Pass identification, replacement for typeid
+ AlignmentFromAssumptions() : FunctionPass(ID) {
+ initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ // For memory transfers, we need a common alignment for both the source and
+ // destination. If we have a new alignment for only one operand of a transfer
+ // instruction, it is saved in these maps. If the other operand is reached
+ // through another assumption later, the alignment may be changed then.
+ DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments; // Cleared at the start of runOnFunction.
+
+ ScalarEvolution *SE; // Set by runOnFunction.
+ DominatorTree *DT; // Set by runOnFunction.
+
+ bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV);
+ bool processAssumption(CallInst *I);
+};
+}
+
+char AlignmentFromAssumptions::ID = 0; // Definition of the static ID declared in the struct.
+static const char aip_name[] = "Alignment from assumptions"; // Description passed to the INITIALIZE_PASS_* macros below.
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+ return new AlignmentFromAssumptions(); // Heap-allocates a new pass object.
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant; returns 0 when it cannot, or when the constant remainder is
+// not a power of 2. Using SCEV handles the case where DiffSCEV is a recurrence
+// with constant start such that the aligned offset is constant. e.g. {16,+,32} % 32 -> 16.
+static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
+ // DiffUnits = (Diff /u Align) * Align - Diff, i.e. the negated remainder; its sign is dropped below.
+ const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
+ const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
+ const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+
+ DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
+ *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+ if (const SCEVConstant *ConstDUSCEV =
+ dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+ int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+ // If the displacement is an exact multiple of the alignment, then the
+ // displaced pointer has the same alignment as the aligned pointer, so
+ // return the alignment value (the cast requires AlignSCEV to be a SCEVConstant).
+ if (!DiffUnits)
+ return (unsigned)
+ cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+
+ // If the displacement is not an exact multiple, but the remainder is a
+ // constant, then return its magnitude (but only if it is a power of 2).
+ uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+ if (isPowerOf2_64(DiffUnitsAbs))
+ return (unsigned) DiffUnitsAbs;
+ }
+
+ return 0; // No constant alignment could be derived.
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr. Returns 0 when no alignment can be derived.
+static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+ // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+ // sign-extended OffSCEV to i64, so make sure they agree again.
+ DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+ // What we really want to know is the overall offset to the aligned
+ // address. This address is displaced by the provided offset.
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+ DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
+ *AlignSCEV << " and offset " << *OffSCEV <<
+ " using diff " << *DiffSCEV << "\n");
+
+ unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
+ DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+
+ if (NewAlignment) {
+ return NewAlignment;
+ } else if (const SCEVAddRecExpr *DiffARSCEV =
+ dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+ // The relative offset to the alignment assumption did not yield a constant,
+ // but we should try harder: if we assume that a is 32-byte aligned, then in
+ // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
+ // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
+ // As a result, the new alignment will not be a constant, but can still
+ // be improved over the default (of 4) to 16.
+
+ const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+ const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+ DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
+ *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+ // Now compute the new alignment using the displacement to the value in the
+ // first iteration, and also the alignment using the per-iteration delta.
+ // If these are the same, then use that answer. Otherwise, use the smaller
+ // one, but only if it divides the larger one; if it does not, return 0.
+ NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+ DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+
+ if (!NewAlignment || !NewIncAlignment) {
+ return 0;
+ } else if (NewAlignment > NewIncAlignment) {
+ if (NewAlignment % NewIncAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewIncAlignment << "\n");
+ return NewIncAlignment;
+ }
+ } else if (NewIncAlignment > NewAlignment) {
+ if (NewIncAlignment % NewAlignment == 0) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ } else if (NewIncAlignment == NewAlignment) {
+ DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+ NewAlignment << "\n");
+ return NewAlignment;
+ }
+ }
+
+ return 0; // Reached when no branch above produced an alignment.
+}
+
+bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,
+ Value *&AAPtr, const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
+ // An alignment assume states that the low bits of a pointer (possibly plus an
+ // offset) are zero. On a match, sets AAPtr/AlignSCEV/OffSCEV and returns true.
+ ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
+ if (!ICI)
+ return false;
+
+ // This must be an expression of the form: x & m == 0.
+ if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
+ return false;
+
+ // Swap things around so that the RHS is 0; bail out if neither side is 0.
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
+ const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
+ if (CmpLHSSCEV->isZero())
+ std::swap(CmpLHS, CmpRHS);
+ else if (!CmpRHSSCEV->isZero())
+ return false;
+
+ BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
+ if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
+ return false;
+
+ // Swap things around so that the right operand of the and is a constant
+ // (the mask); we cannot deal with variable masks.
+ Value *AndLHS = CmpBO->getOperand(0);
+ Value *AndRHS = CmpBO->getOperand(1);
+ const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
+ const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
+ if (isa<SCEVConstant>(AndLHSSCEV)) {
+ std::swap(AndLHS, AndRHS);
+ std::swap(AndLHSSCEV, AndRHSSCEV);
+ }
+
+ const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
+ if (!MaskSCEV)
+ return false;
+
+ // The mask must have some trailing ones (otherwise the condition is
+ // trivial and tells us nothing about the alignment of the left operand).
+ unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes();
+ if (!TrailingOnes)
+ return false;
+
+ // Cap the alignment at the maximum with which LLVM can deal (and make sure
+ // we don't overflow the shift of the unsigned constant 1u below).
+ uint64_t Alignment;
+ TrailingOnes = std::min(TrailingOnes,
+ unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+ Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
+
+ Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
+ AlignSCEV = SE->getConstant(Int64Ty, Alignment);
+
+ // The LHS might be a ptrtoint instruction, or it might be the pointer
+ // with an offset (an add expression with a ptrtoint among its operands).
+ AAPtr = nullptr;
+ OffSCEV = nullptr;
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getZero(Int64Ty);
+ } else if (const SCEVAddExpr* AndLHSAddSCEV =
+ dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
+ // Find the first ptrtoint operand; subtract it and the rest is the offset.
+ for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
+ JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
+ if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
+ if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
+ AAPtr = PToI->getPointerOperand();
+ OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
+ break;
+ }
+ }
+
+ if (!AAPtr)
+ return false;
+
+ // Sign extend the offset to 64 bits (so that it is like all of the other
+ // expressions); offsets wider than 64 bits are rejected.
+ unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
+ if (OffSCEVBits < 64)
+ OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
+ else if (OffSCEVBits > 64)
+ return false;
+
+ AAPtr = AAPtr->stripPointerCasts();
+ return true;
+}
+
+bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { // Returns false only if ACall is not an alignment assume.
+ Value *AAPtr;
+ const SCEV *AlignSCEV, *OffSCEV;
+ if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+ return false;
+
+ const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+ // Seed the worklist with the instruction users of the pointer for which the assume is valid.
+ SmallPtrSet<Instruction *, 32> Visited;
+ SmallVector<Instruction*, 16> WorkList;
+ for (User *J : AAPtr->users()) {
+ if (J == ACall)
+ continue;
+
+ if (Instruction *K = dyn_cast<Instruction>(J))
+ if (isValidAssumeForContext(ACall, K, DT))
+ WorkList.push_back(K);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+
+ if (NewAlignment > LI->getAlignment()) { // Alignments are only ever raised.
+ LI->setAlignment(NewAlignment);
+ ++NumLoadAlignChanged;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+ unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+
+ if (NewAlignment > SI->getAlignment()) {
+ SI->setAlignment(NewAlignment);
+ ++NumStoreAlignChanged;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+ unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MI->getDest(), SE);
+
+ // For memory transfers, we need a common alignment for both the
+ // source and destination. If we have a new alignment for this
+ // instruction, but only for one operand, save it. If we reach the
+ // other operand through another assumption later, then we may
+ // change the alignment at that point.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ MTI->getSource(), SE);
+
+ DenseMap<MemTransferInst *, unsigned>::iterator DI =
+ NewDestAlignments.find(MTI);
+ unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
+ 0 : DI->second;
+
+ DenseMap<MemTransferInst *, unsigned>::iterator SI =
+ NewSrcAlignments.find(MTI);
+ unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
+ 0 : SI->second;
+
+ DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
+ AltDestAlignment << " " << NewSrcAlignment <<
+ " " << AltSrcAlignment << "\n");
+
+ // Pick the largest of the four that is also met by the other operand.
+ unsigned NewAlignment = 0;
+ if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, NewDestAlignment);
+ if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+ NewAlignment = std::max(NewAlignment, AltDestAlignment);
+ if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, NewSrcAlignment);
+ if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+ NewAlignment = std::max(NewAlignment, AltSrcAlignment);
+
+ if (NewAlignment > MI->getAlignment()) {
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewAlignment));
+ ++NumMemIntAlignChanged;
+ }
+
+ NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment)); // NOTE(review): insert() presumably keeps an existing entry -- confirm intended.
+ NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
+ } else if (NewDestAlignment > MI->getAlignment()) { // Not a transfer: only the destination matters.
+ assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
+ "Unknown memory intrinsic");
+
+ MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+ MI->getParent()->getContext()), NewDestAlignment));
+ ++NumMemIntAlignChanged;
+ }
+ }
+
+ // Now that we've updated that use of the pointer, follow the users of J
+ // itself, skipping instructions that have already been processed.
+ Visited.insert(J);
+ for (User *UJ : J->users()) {
+ Instruction *K = cast<Instruction>(UJ); // Assumes every user of J is an Instruction.
+ if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT))
+ WorkList.push_back(K);
+ }
+ }
+
+ return true; // Returned even if no alignment was actually raised.
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) { // Processes every assume cached for F.
+ bool Changed = false;
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ NewDestAlignments.clear(); // Drop saved alignments from any earlier run.
+ NewSrcAlignments.clear();
+
+ for (auto &AssumeVH : AC.assumptions())
+ if (AssumeVH) // Skip handles that test false.
+ Changed |= processAssumption(cast<CallInst>(AssumeVH));
+
+ return Changed;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
new file mode 100644
index 0000000..cb9b8b6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -0,0 +1,102 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+namespace {
+struct BDCE : public FunctionPass { // Function pass driven by the DemandedBits analysis.
+ static char ID; // Pass identification, replacement for typeid
+ BDCE() : FunctionPass(ID) {
+ initializeBDCEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function& F) override;
+
+ void getAnalysisUsage(AnalysisUsage& AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DemandedBits>(); // Queried in runOnFunction.
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char BDCE::ID = 0; // Definition of the static ID declared in the struct.
+INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBits)
+INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+ false, false)
+
+bool BDCE::runOnFunction(Function& F) { // Returns true iff F was modified.
+ if (skipOptnoneFunction(F))
+ return false;
+ DemandedBits &DB = getAnalysis<DemandedBits>();
+
+ SmallVector<Instruction*, 128> Worklist; // Instructions to erase after the scan.
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ if (I.getType()->isIntegerTy() &&
+ !DB.getDemandedBits(&I).getBoolValue()) {
+ // For integer instructions with no demanded bits, first make them dead by
+ // replacing all uses with zero. Then, if DemandedBits reports them dead
+ // (checked below), they are queued for removal.
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+ // FIXME: In theory we could substitute undef here instead of zero.
+ // This should be reconsidered once we settle on the semantics of
+ // undef, poison, etc.
+ Value *Zero = ConstantInt::get(I.getType(), 0);
+ ++NumSimplified;
+ I.replaceAllUsesWith(Zero);
+ Changed = true;
+ }
+ if (!DB.isInstructionDead(&I))
+ continue;
+
+ Worklist.push_back(&I);
+ I.dropAllReferences(); // Operands are dropped now; erasure happens after the loop.
+ Changed = true;
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createBitTrackingDCEPass() {
+ return new BDCE(); // Heap-allocates a new pass object.
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
new file mode 100644
index 0000000..84f7f5f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -0,0 +1,606 @@
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare it for SelectionDAG-based code generation. This works around
+// the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates their
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_Basic), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCost will always return TCC_Free.
+//
+// If the cost is more than TCC_Basic, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "consthoist"
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+namespace {
+struct ConstantUser;
+struct RebasedConstantInfo;
+
+typedef SmallVector<ConstantUser, 8> ConstantUseListType;
+typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType;
+
+/// \brief Keeps track of the user of a constant and the operand index where the
+/// constant is used.
+struct ConstantUser {
+ Instruction *Inst; // Instruction that uses the constant.
+ unsigned OpndIdx; // Operand index within Inst.
+
+ ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { }
+};
+
+/// \brief Keeps track of a constant candidate and its uses.
+struct ConstantCandidate {
+ ConstantUseListType Uses; // One entry per addUser call.
+ ConstantInt *ConstInt; // The candidate constant.
+ unsigned CumulativeCost; // Sum of the costs passed to addUser.
+
+ ConstantCandidate(ConstantInt *ConstInt)
+ : ConstInt(ConstInt), CumulativeCost(0) { }
+
+ /// \brief Add the user to the use list and update the cost.
+ void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) {
+ CumulativeCost += Cost;
+ Uses.push_back(ConstantUser(Inst, Idx));
+ }
+};
+
+/// \brief This represents a constant that has been rebased with respect to a
+/// base constant. The difference to the base constant is recorded in Offset.
+struct RebasedConstantInfo {
+ ConstantUseListType Uses; // Use list moved in from the constant candidate.
+ Constant *Offset; // nullptr when the difference to the base is zero.
+
+ RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset)
+ : Uses(std::move(Uses)), Offset(Offset) { }
+};
+
+/// \brief A base constant and all its rebased constants.
+struct ConstantInfo {
+ ConstantInt *BaseConstant; // Candidate with the maximum cumulative cost.
+ RebasedConstantListType RebasedConstants; // One per candidate, base included.
+};
+
+/// \brief The constant hoisting pass.
+class ConstantHoisting : public FunctionPass {
+ typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType; // -> ConstCandVec index
+ typedef std::vector<ConstantCandidate> ConstCandVecType;
+
+ const TargetTransformInfo *TTI; // Set in setup(), reset in cleanup().
+ DominatorTree *DT; // Set in setup(), reset in cleanup().
+ BasicBlock *Entry; // Entry block of the current function.
+
+ /// Keeps track of constant candidates found in the function.
+ ConstCandVecType ConstCandVec;
+
+ /// Keep track of cast instructions we already cloned.
+ SmallDenseMap<Instruction *, Instruction *> ClonedCastMap; // original -> clone
+
+ /// These are the final constants we decided to hoist.
+ SmallVector<ConstantInfo, 8> ConstantVec;
+public:
+ static char ID; // Pass identification, replacement for typeid
+ ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr),
+ Entry(nullptr) {
+ initializeConstantHoistingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ const char *getPassName() const override { return "Constant Hoisting"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>(); // Read in setup().
+ AU.addRequired<TargetTransformInfoWrapperPass>(); // Read in setup().
+ }
+
+private:
+ /// \brief Initialize the pass: cache the analyses and the entry block of Fn.
+ void setup(Function &Fn) {
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+ Entry = &Fn.getEntryBlock();
+ }
+
+ /// \brief Cleanup: clear the per-function containers and cached pointers.
+ void cleanup() {
+ ConstantVec.clear();
+ ClonedCastMap.clear();
+ ConstCandVec.clear();
+
+ TTI = nullptr;
+ DT = nullptr;
+ Entry = nullptr;
+ }
+
+ Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; // ~0U: no operand index
+ Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const;
+ void collectConstantCandidates(ConstCandMapType &ConstCandMap,
+ Instruction *Inst, unsigned Idx,
+ ConstantInt *ConstInt);
+ void collectConstantCandidates(ConstCandMapType &ConstCandMap,
+ Instruction *Inst);
+ void collectConstantCandidates(Function &Fn);
+ void findAndMakeBaseConstant(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E);
+ void findBaseConstants();
+ void emitBaseConstants(Instruction *Base, Constant *Offset,
+ const ConstantUser &ConstUser);
+ bool emitBaseConstants();
+ void deleteDeadCastInst() const;
+ bool optimizeConstants(Function &Fn);
+};
+}
+
+char ConstantHoisting::ID = 0; // Pass identification, replacement for typeid
+INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // Matches getAnalysisUsage.
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // Matches getAnalysisUsage.
+INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting",
+ false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+ return new ConstantHoisting(); // Heap-allocates a new pass object on every call.
+}
+
+/// \brief Perform the constant hoisting optimization for the given function.
+bool ConstantHoisting::runOnFunction(Function &Fn) {
+ if (skipOptnoneFunction(Fn))
+ return false;
+
+ DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+ setup(Fn); // Cache DT, TTI and the entry block for this function.
+
+ bool MadeChange = optimizeConstants(Fn);
+
+ if (MadeChange) {
+ DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ DEBUG(dbgs() << Fn);
+ }
+ DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+ cleanup(); // Clear the per-function state before returning.
+
+ return MadeChange;
+}
+
+
+/// \brief Find the constant materialization insertion point.
+Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst,
+ unsigned Idx) const {
+ // If the operand is a cast instruction, then we have to materialize the
+ // constant before the cast instruction.
+ if (Idx != ~0U) { // ~0U means no operand index was supplied.
+ Value *Opnd = Inst->getOperand(Idx);
+ if (auto CastInst = dyn_cast<Instruction>(Opnd))
+ if (CastInst->isCast())
+ return CastInst;
+ }
+
+ // The simple and common case. This also includes constant expressions.
+ if (!isa<PHINode>(Inst) && !Inst->isEHPad())
+ return Inst;
+
+ // We can't insert directly before a phi node or an eh pad. Insert before
+ // the terminator of the incoming or dominating block.
+ assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+ if (Idx != ~0U && isa<PHINode>(Inst)) // PHI operand: use its incoming block.
+ return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+ BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock(); // Otherwise: immediate dominator.
+ return IDom->getTerminator();
+}
+
+/// \brief Find an insertion point that dominates all uses.
+Instruction *ConstantHoisting::
+findConstantInsertionPoint(const ConstantInfo &ConstInfo) const {
+ assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+ // Collect all basic blocks.
+ SmallPtrSet<BasicBlock *, 8> BBs; // Blocks of every materialization point.
+ for (auto const &RCI : ConstInfo.RebasedConstants)
+ for (auto const &U : RCI.Uses)
+ BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+ if (BBs.count(Entry)) // A use in the entry block: insert at its first instruction.
+ return &Entry->front();
+
+ while (BBs.size() >= 2) { // Replace two blocks by their nearest common dominator.
+ BasicBlock *BB, *BB1, *BB2;
+ BB1 = *BBs.begin();
+ BB2 = *std::next(BBs.begin());
+ BB = DT->findNearestCommonDominator(BB1, BB2);
+ if (BB == Entry)
+ return &Entry->front();
+ BBs.erase(BB1);
+ BBs.erase(BB2);
+ BBs.insert(BB);
+ }
+ assert((BBs.size() == 1) && "Expected only one element.");
+ Instruction &FirstInst = (*BBs.begin())->front();
+ return findMatInsertPt(&FirstInst); // Redirects if FirstInst is a PHI or EH pad.
+}
+
+
+/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+/// constant integer.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+ Instruction *Inst,
+ unsigned Idx,
+ ConstantInt *ConstInt) {
+ unsigned Cost;
+ // Ask the target about the cost of materializing the constant for the given
+ // instruction and operand index.
+ if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+ Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx,
+ ConstInt->getValue(), ConstInt->getType());
+ else
+ Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(),
+ ConstInt->getType());
+
+ // Ignore cheap integer constants.
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0)); // 0 is a placeholder index.
+ if (Inserted) { // First sighting: create the candidate and record its index.
+ ConstCandVec.push_back(ConstantCandidate(ConstInt));
+ Itr->second = ConstCandVec.size() - 1;
+ }
+ ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
+ DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
+ dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+ << " with cost " << Cost << '\n';
+ else
+ dbgs() << "Collect constant " << *ConstInt << " indirectly from "
+ << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
+ << Cost << '\n';
+ );
+ }
+}
+
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+ Instruction *Inst) {
+ // Skip all cast instructions. They are visited via the operands of their users.
+ if (Inst->isCast())
+ return;
+
+ // Can't handle inline asm. Skip it.
+ if (auto Call = dyn_cast<CallInst>(Inst))
+ if (isa<InlineAsm>(Call->getCalledValue()))
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
+
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ continue;
+ }
+
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ continue;
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ continue;
+ }
+ }
+
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ continue;
+
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ continue;
+ }
+ }
+ } // end of for all operands
+}
+
+/// \brief Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoisting::collectConstantCandidates(Function &Fn) {
+ ConstCandMapType ConstCandMap; // Local lookup table; results land in ConstCandVec.
+ for (BasicBlock &BB : Fn)
+ for (Instruction &Inst : BB)
+ collectConstantCandidates(ConstCandMap, &Inst);
+}
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E) {
+ auto MaxCostItr = S;
+ unsigned NumUses = 0; // Total number of uses over the range [S, E).
+ // Use the constant that has the maximum cost as base constant.
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ NumUses += ConstCand->Uses.size();
+ if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) // Ties keep the earlier one.
+ MaxCostItr = ConstCand;
+ }
+
+ // Don't hoist constants that have only one use.
+ if (NumUses <= 1)
+ return;
+
+ ConstantInfo ConstInfo;
+ ConstInfo.BaseConstant = MaxCostItr->ConstInt;
+ Type *Ty = ConstInfo.BaseConstant->getType();
+
+ // Rebase the constants with respect to the base constant.
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ APInt Diff = ConstCand->ConstInt->getValue() -
+ ConstInfo.BaseConstant->getValue();
+ Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff); // nullptr: equals the base.
+ ConstInfo.RebasedConstants.push_back(
+ RebasedConstantInfo(std::move(ConstCand->Uses), Offset)); // Moves the use list out.
+ }
+ ConstantVec.push_back(std::move(ConstInfo));
+}
+
+/// \brief Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoisting::findBaseConstants() {
+ // Sort the constants by value and type. This invalidates the mapping!
+ std::sort(ConstCandVec.begin(), ConstCandVec.end(),
+ [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+ if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+ return LHS.ConstInt->getType()->getBitWidth() <
+ RHS.ConstInt->getType()->getBitWidth();
+ return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue()); // Unsigned order.
+ });
+
+ // Simple linear scan through the sorted constant candidate vector for viable
+ // merge candidates.
+ auto MinValItr = ConstCandVec.begin(); // optimizeConstants returns early if empty.
+ for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+ CC != E; ++CC) {
+ if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+ // Check if the constant is in range of an add with immediate.
+ APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+ if ((Diff.getBitWidth() <= 64) && // Only then is getSExtValue called.
+ TTI->isLegalAddImmediate(Diff.getSExtValue()))
+ continue;
+ }
+ // We either have now a different constant type or the constant is not in
+ // range of an add with immediate anymore.
+ findAndMakeBaseConstant(MinValItr, CC);
+ // Start a new base constant search.
+ MinValItr = CC;
+ }
+ // Finalize the last base constant search.
+ findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
+}
+
+/// \brief Updates the operand at Idx in instruction Inst with the result of
+/// instruction Mat. If the instruction is a PHI node then special
+/// handling for duplicate values from the same incoming basic block is
+/// required.
+/// \return The update will always succeed, but the return value indicates
+/// whether Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+ if (auto PHI = dyn_cast<PHINode>(Inst)) {
+ // Check if any previous operand of the PHI node has the same incoming basic
+ // block. This is a very odd case that happens when the incoming basic block
+ // has a switch statement. In this case use the same value as the previous
+ // operand(s), otherwise we will fail verification due to different values.
+ // The values are actually the same, but the variable names are different
+ // and the verifier doesn't like that.
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+ for (unsigned i = 0; i < Idx; ++i) {
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ Value *IncomingVal = PHI->getIncomingValue(i);
+ Inst->setOperand(Idx, IncomingVal);
+ return false; // Mat was not used; the caller may have to delete it.
+ }
+ }
+ }
+
+ Inst->setOperand(Idx, Mat);
+ return true;
+}
+
+/// \brief Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset,
+ const ConstantUser &ConstUser) {
+ Instruction *Mat = Base; // Without an offset the base is used directly.
+ if (Offset) {
+ Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx);
+ Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+ "const_mat", InsertionPt);
+
+ DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+ Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+ }
+ Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+ // Visit constant integer.
+ if (isa<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+ Mat->eraseFromParent(); // The add created above ended up unused.
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit cast instruction.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ assert(CastInst->isCast() && "Expected an cast instruction!");
+ // Check if we already have visited this cast instruction before to avoid
+ // unnecessary cloning.
+ Instruction *&ClonedCastInst = ClonedCastMap[CastInst]; // Reference into the map.
+ if (!ClonedCastInst) { // NOTE(review): on a cache hit a new Mat add looks unused - confirm.
+ ClonedCastInst = CastInst->clone();
+ ClonedCastInst->setOperand(0, Mat);
+ ClonedCastInst->insertAfter(CastInst);
+ // Use the same debug location as the original cast instruction.
+ ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+ DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
+ }
+
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit constant expression.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+ ConstExprInst->setOperand(0, Mat);
+ ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx));
+
+ // Use the same debug location as the instruction we are about to update.
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+ DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+ ConstExprInst->eraseFromParent(); // Not used by the update: remove it again.
+ if (Offset)
+ Mat->eraseFromParent(); // Likewise for the add feeding it.
+ }
+ DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+}
+
+/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoisting::emitBaseConstants() {
+ bool MadeChange = false;
+ for (auto const &ConstInfo : ConstantVec) {
+ // Hoist and hide the base constant behind a bitcast.
+ Instruction *IP = findConstantInsertionPoint(ConstInfo);
+ IntegerType *Ty = ConstInfo.BaseConstant->getType();
+ Instruction *Base =
+ new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); // Cast to its own type.
+ DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB "
+ << IP->getParent()->getName() << '\n' << *Base << '\n');
+ NumConstantsHoisted++;
+
+ // Emit materialization code for all rebased constants.
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ NumConstantsRebased++;
+ for (auto const &U : RCI.Uses)
+ emitBaseConstants(Base, RCI.Offset, U); // Offset is nullptr for the base itself.
+ }
+
+ // Use the same debug location as the last user of the constant.
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
+
+ // Correct for base constant, which we counted above too.
+ NumConstantsRebased--;
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// \brief Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoisting::deleteDeadCastInst() const {
+ for (auto const &I : ClonedCastMap) // I.first: original cast, I.second: its clone.
+ if (I.first->use_empty())
+ I.first->eraseFromParent();
+}
+
+/// \brief Optimize expensive integer constants in the given function.
+bool ConstantHoisting::optimizeConstants(Function &Fn) {
+ // Collect all constant candidates.
+ collectConstantCandidates(Fn);
+
+ // There are no constant candidates to worry about.
+ if (ConstCandVec.empty()) // findBaseConstants relies on a non-empty vector.
+ return false;
+
+ // Combine constants that can be easily materialized with an add from a common
+ // base constant.
+ findBaseConstants();
+
+ // There are no constants to emit.
+ if (ConstantVec.empty())
+ return false;
+
+ // Finally hoist the base constant and emit materialization code for dependent
+ // constants.
+ bool MadeChange = emitBaseConstants();
+
+ // Cleanup dead instructions.
+ deleteDeadCastInst(); // Removes original casts that lost all their users.
+
+ return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 0000000..c974ebb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,98 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+// * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+// * This pass has a habit of making definitions be dead. It is a good idea
+// to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "constprop"
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+ struct ConstantPropagation : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ConstantPropagation() : FunctionPass(ID) {
+ initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG(); // Declares the CFG as preserved.
+ AU.addRequired<TargetLibraryInfoWrapperPass>(); // TLI is passed to the constant folder.
+ }
+ };
+}
+
+char ConstantPropagation::ID = 0; // Pass identification, replacement for typeid
+INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
+ "Simple constant propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) // Matches getAnalysisUsage.
+INITIALIZE_PASS_END(ConstantPropagation, "constprop",
+ "Simple constant propagation", false, false)
+
+FunctionPass *llvm::createConstantPropagationPass() {
+ return new ConstantPropagation(); // Heap-allocates a new pass object on every call.
+}
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+ // Initialize the worklist to all of the instructions ready to process...
+ std::set<Instruction*> WorkList; // Set semantics: each instruction is queued once.
+ for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+ WorkList.insert(&*i);
+ }
+ bool Changed = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ while (!WorkList.empty()) {
+ Instruction *I = *WorkList.begin();
+ WorkList.erase(WorkList.begin()); // Get an element from the worklist...
+
+ if (!I->use_empty()) // Don't muck with dead instructions...
+ if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+ // Add all of the users of this instruction to the worklist, they might
+ // be constant propagatable now...
+ for (User *U : I->users())
+ WorkList.insert(cast<Instruction>(U));
+
+ // Replace all of the uses of a variable with uses of the constant.
+ I->replaceAllUsesWith(C);
+
+ // Remove the dead instruction.
+ WorkList.erase(I); // Make sure I is not left queued before it is erased.
+ I->eraseFromParent();
+
+ // We made a change to the function...
+ Changed = true;
+ ++NumInstKilled;
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
new file mode 100644
index 0000000..686bd40
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -0,0 +1,419 @@
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "correlated-value-propagation"
+
+STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumSelects, "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps, "Number of comparisons propagated");
+STATISTIC(NumReturns, "Number of return values propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
+
+namespace {
+ class CorrelatedValuePropagation : public FunctionPass {
+ LazyValueInfo *LVI; // NOTE(review): not set by the constructor; presumably assigned in runOnFunction - confirm.
+
+ bool processSelect(SelectInst *SI);
+ bool processPHI(PHINode *P);
+ bool processMemAccess(Instruction *I);
+ bool processCmp(CmpInst *C);
+ bool processSwitch(SwitchInst *SI);
+ bool processCallSite(CallSite CS);
+
+ /// Return a constant value for V usable at At and everything it
+ /// dominates. If no such Constant can be found, return nullptr.
+ Constant *getConstantAt(Value *V, Instruction *At);
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ CorrelatedValuePropagation(): FunctionPass(ID) {
+ initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LazyValueInfo>(); // The process* helpers query it through LVI.
+ AU.addPreserved<GlobalsAAWrapperPass>(); // Declares GlobalsAA as preserved.
+ }
+ };
+}
+
+char CorrelatedValuePropagation::ID = 0; // Pass identification, replacement for typeid
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) // Matches addRequired<LazyValueInfo>().
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+
+// Public interface to the Correlated Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+ return new CorrelatedValuePropagation(); // Heap-allocates a new pass object on every call.
+}
+
+bool CorrelatedValuePropagation::processSelect(SelectInst *S) { // True if S was replaced and erased.
+ if (S->getType()->isVectorTy()) return false; // Vector-typed selects are skipped.
+ if (isa<Constant>(S->getOperand(0))) return false; // Condition is already a constant.
+
+ Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+ if (!C) return false;
+
+ ConstantInt *CI = dyn_cast<ConstantInt>(C);
+ if (!CI) return false;
+
+ Value *ReplaceWith = S->getOperand(1);
+ Value *Other = S->getOperand(2);
+ if (!CI->isOne()) std::swap(ReplaceWith, Other); // Condition is not 1: use operand 2.
+ if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType()); // Select of itself: use undef.
+
+ S->replaceAllUsesWith(ReplaceWith);
+ S->eraseFromParent();
+
+ ++NumSelects;
+
+ return true;
+}
+
+bool CorrelatedValuePropagation::processPHI(PHINode *P) {
+ bool Changed = false;
+
+ BasicBlock *BB = P->getParent();
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (isa<Constant>(Incoming)) continue; // Already a constant: nothing to do.
+
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
+
+ // Look if the incoming value is a select with a scalar condition for which
+ // LVI can tell us the value. In that case replace the incoming value with
+ // the appropriate value of the select. This often allows us to remove the
+ // select later.
+ if (!V) {
+ SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+ if (!SI) continue;
+
+ Value *Condition = SI->getCondition();
+ if (!Condition->getType()->isVectorTy()) {
+ if (Constant *C = LVI->getConstantOnEdge(
+ Condition, P->getIncomingBlock(i), BB, P)) {
+ if (C->isOneValue()) {
+ V = SI->getTrueValue();
+ } else if (C->isZeroValue()) {
+ V = SI->getFalseValue();
+ }
+ // Once LVI learns to handle vector types, we could also add support
+ // for vector type constants that are not all zeroes or all ones.
+ }
+ }
+
+ // Look if the select has a constant but LVI tells us that the incoming
+ // value can never be that constant. In that case replace the incoming
+ // value with the other value of the select. This often allows us to
+ // remove the select later.
+ if (!V) {
+ Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+ if (!C) continue;
+
+ if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+ P->getIncomingBlock(i), BB, P) !=
+ LazyValueInfo::False)
+ continue;
+ V = SI->getTrueValue();
+ }
+
+ DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ }
+
+ P->setIncomingValue(i, V);
+ Changed = true;
+ }
+
+ // FIXME: Provide TLI, DT, AT to SimplifyInstruction.
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(P, DL)) {
+ P->replaceAllUsesWith(V);
+ P->eraseFromParent(); // P must not be used by the caller after a true return.
+ Changed = true;
+ }
+
+ if (Changed)
+ ++NumPhis; // Counted once per PHI, however many operands changed.
+
+ return Changed;
+}
+
+bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
+ Value *Pointer = nullptr;
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ Pointer = L->getPointerOperand();
+ else
+ Pointer = cast<StoreInst>(I)->getPointerOperand(); // Anything that is not a load is treated as a store.
+
+ if (isa<Constant>(Pointer)) return false; // Address is already a constant.
+
+ Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
+ if (!C) return false;
+
+ ++NumMemAccess;
+ I->replaceUsesOfWith(Pointer, C); // Only I's own operands are rewritten.
+ return true;
+}
+
+/// processCmp - See if LazyValueInfo's ability to exploit edge conditions,
+/// or range information is sufficient to prove this comparison. Even for
+/// local conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return false; // Only a constant second operand is handled.
+
+ // As a policy choice, we choose not to waste compile time on anything where
+ // the comparison is testing local values. While LVI can sometimes reason
+ // about such cases, it's not its primary purpose. We do make sure to do
+ // the block local query for uses from terminator instructions, but that's
+ // handled in the code for each terminator.
+ auto *I = dyn_cast<Instruction>(Op0);
+ if (I && I->getParent() == C->getParent())
+ return false;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
+ if (Result == LazyValueInfo::Unknown) return false;
+
+ ++NumCmps;
+ if (Result == LazyValueInfo::True)
+ C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
+ else
+ C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
+ C->eraseFromParent(); // C must not be used by the caller after a true return.
+
+ return true;
+}
+
+/// processSwitch - Simplify a switch instruction by removing cases which can
+/// never fire. If the uselessness of a case could be determined locally then
+/// constant propagation would already have figured it out. Instead, walk the
+/// predecessors and statically evaluate cases based on information available
+/// on that edge. Cases that cannot fire no matter what the incoming edge can
+/// safely be removed. If a case fires on every incoming edge then the entire
+/// switch can be removed and replaced with a branch to the case destination.
+///
+/// Returns true if any case was removed or the switch condition was replaced.
+bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
+ Value *Cond = SI->getCondition();
+ BasicBlock *BB = SI->getParent();
+
+ // If the condition was defined in same block as the switch then LazyValueInfo
+ // currently won't say anything useful about it, though in theory it could.
+ if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
+ return false;
+
+ // If the switch is unreachable then trying to improve it is a waste of time.
+ pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+ if (PB == PE) return false;
+
+ // Analyse each switch case in turn. This is done in reverse order so that
+ // removing a case doesn't cause trouble for the iteration.
+ // Note the loop condition: the post-decrement compares the current position
+ // against case_begin() and then steps back, so the body always sees the case
+ // just before the position that was tested.
+ bool Changed = false;
+ for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
+ ) {
+ ConstantInt *Case = CI.getCaseValue();
+
+ // Check to see if the switch condition is equal to/not equal to the case
+ // value on every incoming edge, equal/not equal being the same each time.
+ // State ends up True/False only if every predecessor edge agrees; any
+ // Unknown answer or disagreement leaves it Unknown.
+ LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ // Is the switch condition equal to the case value?
+ LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
+ Cond, Case, *PI,
+ BB, SI);
+ // Give up on this case if nothing is known.
+ if (Value == LazyValueInfo::Unknown) {
+ State = LazyValueInfo::Unknown;
+ break;
+ }
+
+ // If this was the first edge to be visited, record that all other edges
+ // need to give the same result.
+ if (PI == PB) {
+ State = Value;
+ continue;
+ }
+
+ // If this case is known to fire for some edges and known not to fire for
+ // others then there is nothing we can do - give up.
+ if (Value != State) {
+ State = LazyValueInfo::Unknown;
+ break;
+ }
+ }
+
+ if (State == LazyValueInfo::False) {
+ // This case never fires - remove it.
+ CI.getCaseSuccessor()->removePredecessor(BB);
+ SI->removeCase(CI); // Does not invalidate the iterator.
+
+ // The condition can be modified by removePredecessor's PHI simplification
+ // logic.
+ Cond = SI->getCondition();
+
+ ++NumDeadCases;
+ Changed = true;
+ } else if (State == LazyValueInfo::True) {
+ // This case always fires. Arrange for the switch to be turned into an
+ // unconditional branch by replacing the switch condition with the case
+ // value.
+ SI->setCondition(Case);
+ // Every case still present on the switch is added to the statistic here.
+ NumDeadCases += SI->getNumCases();
+ Changed = true;
+ break;
+ }
+ }
+
+ if (Changed)
+ // If the switch has been simplified to the point where it can be replaced
+ // by a branch then do so now.
+ ConstantFoldTerminator(BB);
+
+ return Changed;
+}
+
+/// processCallSite - Add the nonnull attribute to every pointer argument of
+/// the call site that LazyValueInfo proves is never equal to null at the
+/// call. Returns true if at least one attribute was added.
+bool CorrelatedValuePropagation::processCallSite(CallSite CS) {
+  SmallVector<unsigned, 4> NonNullSlots;
+  unsigned NumSeen = 0;
+
+  for (Value *Arg : CS.args()) {
+    // The attribute slot used for an argument is its position plus one.
+    const unsigned Slot = ++NumSeen;
+
+    auto *PtrTy = dyn_cast<PointerType>(Arg->getType());
+    if (!PtrTy)
+      continue;
+
+    // Nothing to infer if the attribute is already there.
+    if (CS.paramHasAttr(Slot, Attribute::NonNull))
+      continue;
+
+    // Only a definite "never equal to null" answer is good enough.
+    if (LVI->getPredicateAt(ICmpInst::ICMP_EQ, Arg,
+                            ConstantPointerNull::get(PtrTy),
+                            CS.getInstruction()) != LazyValueInfo::False)
+      continue;
+
+    NonNullSlots.push_back(Slot);
+  }
+
+  assert(NumSeen == CS.arg_size() && "sanity check");
+
+  if (NonNullSlots.empty())
+    return false;
+
+  LLVMContext &Ctx = CS.getInstruction()->getContext();
+  AttributeSet Updated = CS.getAttributes().addAttribute(
+      Ctx, NonNullSlots, Attribute::get(Ctx, Attribute::NonNull));
+  CS.setAttributes(Updated);
+
+  return true;
+}
+
+/// getConstantAt - Return the constant that V is known to hold at instruction
+/// At, or null if none can be proven. LazyValueInfo is asked directly first;
+/// failing that, a comparison against a constant is evaluated through
+/// getPredicateAt and turned into a true/false constant.
+Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) {
+  if (Constant *Known = LVI->getConstant(V, At->getParent(), At))
+    return Known;
+
+  // TODO: The following really should be sunk inside LVI's core algorithm, or
+  // at least the outer shims around such.
+  auto *Cmp = dyn_cast<CmpInst>(V);
+  if (Cmp == nullptr)
+    return nullptr;
+
+  Value *LHS = Cmp->getOperand(0);
+  auto *RHS = dyn_cast<Constant>(Cmp->getOperand(1));
+  if (RHS == nullptr)
+    return nullptr;
+
+  const LazyValueInfo::Tristate Verdict =
+      LVI->getPredicateAt(Cmp->getPredicate(), LHS, RHS, At);
+
+  if (Verdict == LazyValueInfo::Unknown)
+    return nullptr;
+  if (Verdict == LazyValueInfo::True)
+    return ConstantInt::getTrue(Cmp->getContext());
+  return ConstantInt::getFalse(Cmp->getContext());
+}
+
+/// runOnFunction - Apply the per-instruction simplifications to every
+/// instruction of every block in F, then try to simplify each block's
+/// terminator (switches and value-returning returns). Returns true if
+/// anything in the function changed.
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  LVI = &getAnalysis<LazyValueInfo>();
+
+  bool AnyChange = false;
+
+  for (BasicBlock &BB : F) {
+    bool BlockChanged = false;
+
+    // Step the iterator past each instruction before handling it, because a
+    // handler may erase the instruction it is given.
+    for (BasicBlock::iterator It = BB.begin(), End = BB.end(); It != End;) {
+      Instruction *Cur = &*It;
+      ++It;
+
+      switch (Cur->getOpcode()) {
+      case Instruction::Select:
+        BlockChanged |= processSelect(cast<SelectInst>(Cur));
+        break;
+      case Instruction::PHI:
+        BlockChanged |= processPHI(cast<PHINode>(Cur));
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        BlockChanged |= processCmp(cast<CmpInst>(Cur));
+        break;
+      case Instruction::Load:
+      case Instruction::Store:
+        BlockChanged |= processMemAccess(Cur);
+        break;
+      case Instruction::Call:
+      case Instruction::Invoke:
+        BlockChanged |= processCallSite(CallSite(Cur));
+        break;
+      }
+    }
+
+    Instruction *Term = BB.getTerminator();
+    switch (Term->getOpcode()) {
+    case Instruction::Switch:
+      BlockChanged |= processSwitch(cast<SwitchInst>(Term));
+      break;
+    case Instruction::Ret: {
+      // Try to determine the return value if we can. This is mainly here to
+      // simplify the writing of unit tests, but also helps to enable IPO by
+      // constant folding the return values of callees.
+      auto *Ret = cast<ReturnInst>(Term);
+      Value *Returned = Ret->getReturnValue();
+
+      // Skip "ret void" and returns whose operand is already a constant.
+      if (Returned && !isa<Constant>(Returned)) {
+        if (Constant *Folded = getConstantAt(Returned, Ret)) {
+          ++NumReturns;
+          Ret->replaceUsesOfWith(Returned, Folded);
+          BlockChanged = true;
+        }
+      }
+      break;
+    }
+    }
+
+    AnyChange |= BlockChanged;
+  }
+
+  return AnyChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 0000000..b67c3c7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,156 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead. Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dce"
+
+// Statistic counters for this file: DIEEliminated is incremented in
+// DeadInstElimination::runOnBasicBlock and DCEEliminated in DCEInstruction,
+// once per erased instruction.
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadInstElimination pass implementation
+  //
+  /// Basic-block pass that sweeps once over a block and erases every
+  /// instruction that isInstructionTriviallyDead reports as dead.
+  struct DeadInstElimination : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+
+    DeadInstElimination() : BasicBlockPass(ID) {
+      initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Erase the trivially dead instructions of BB; returns true if any
+    /// instruction was removed.
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      if (skipOptnoneFunction(BB))
+        return false;
+
+      // Library information is optional; a null TLI is passed on when the
+      // wrapper pass is not available.
+      TargetLibraryInfo *TLI = nullptr;
+      if (auto *Wrapper =
+              getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>())
+        TLI = &Wrapper->getTLI();
+
+      bool Erased = false;
+      BasicBlock::iterator Pos = BB.begin();
+      while (Pos != BB.end()) {
+        // Advance first so erasing the current instruction is safe.
+        Instruction *Cur = &*Pos;
+        ++Pos;
+
+        if (!isInstructionTriviallyDead(Cur, TLI))
+          continue;
+
+        Cur->eraseFromParent();
+        Erased = true;
+        ++DIEEliminated;
+      }
+      return Erased;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+    }
+  };
+}
+
+// Pass identification storage and registration of DeadInstElimination under
+// the argument string "die".
+char DeadInstElimination::ID = 0;
+INITIALIZE_PASS(DeadInstElimination, "die",
+ "Dead Instruction Elimination", false, false)
+
+/// Factory: returns a newly allocated DeadInstElimination pass.
+Pass *llvm::createDeadInstEliminationPass() {
+ return new DeadInstElimination();
+}
+
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ // DeadCodeElimination pass implementation
+ //
+ // Function pass; unlike DeadInstElimination it re-examines the operands of
+ // each removed instruction (see DCEInstruction) so newly dead instructions
+ // are removed as well.
+ struct DCE : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCE() : FunctionPass(ID) {
+ initializeDCEPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Defined out of line below.
+ bool runOnFunction(Function &F) override;
+
+ // The pass only erases instructions, so it declares the CFG preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+ };
+}
+
+// Pass identification storage and registration of DCE under the argument
+// string "dce".
+char DCE::ID = 0;
+INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
+
+/// DCEInstruction - If I is trivially dead, erase it and queue on WorkList
+/// every operand instruction that became trivially dead as a result.
+/// Returns true if I was erased, false if it was left alone.
+static bool DCEInstruction(Instruction *I,
+                           SmallSetVector<Instruction *, 16> &WorkList,
+                           const TargetLibraryInfo *TLI) {
+  if (!isInstructionTriviallyDead(I, TLI))
+    return false;
+
+  // Drop the operands one at a time, checking after each whether the value
+  // that was referenced has just lost its last use.
+  for (unsigned Idx = 0, NumOps = I->getNumOperands(); Idx != NumOps; ++Idx) {
+    Value *Operand = I->getOperand(Idx);
+    I->setOperand(Idx, nullptr);
+
+    // Only operands that are now unused, and are not I itself, matter.
+    const bool NewlyUnused = Operand->use_empty() && I != Operand;
+    if (!NewlyUnused)
+      continue;
+
+    // A trivially dead operand instruction is deleted on a later visit of
+    // the worklist rather than right here.
+    auto *OperandInst = dyn_cast<Instruction>(Operand);
+    if (OperandInst && isInstructionTriviallyDead(OperandInst, TLI))
+      WorkList.insert(OperandInst);
+  }
+
+  I->eraseFromParent();
+  ++DCEEliminated;
+  return true;
+}
+
+/// runOnFunction - Remove trivially dead instructions from F, including
+/// those that only become dead once their users have been removed.
+/// Returns true if any instruction was erased.
+bool DCE::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  // Library information is optional; null is passed on when unavailable.
+  TargetLibraryInfo *TLI = nullptr;
+  if (auto *Wrapper = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>())
+    TLI = &Wrapper->getTLI();
+
+  bool Erased = false;
+  SmallSetVector<Instruction *, 16> Revisit;
+
+  // First sweep: walk the function as it stands. Instructions are put on the
+  // revisit list only when they need a second look, so the list never has to
+  // be seeded with the whole function.
+  inst_iterator Pos = inst_begin(F);
+  const inst_iterator End = inst_end(F);
+  while (Pos != End) {
+    Instruction *Cur = &*Pos;
+    ++Pos;
+
+    // Instructions already queued are handled by the second phase below.
+    if (Revisit.count(Cur))
+      continue;
+    if (DCEInstruction(Cur, Revisit, TLI))
+      Erased = true;
+  }
+
+  // Second phase: drain the list, which may grow while it is processed.
+  while (!Revisit.empty()) {
+    Instruction *Cur = Revisit.pop_back_val();
+    if (DCEInstruction(Cur, Revisit, TLI))
+      Erased = true;
+  }
+
+  return Erased;
+}
+
+/// Factory: returns a newly allocated DCE pass.
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCE();
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 0000000..36ad0a5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,961 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal. Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dse"
+
+// Statistic counters for this file. NumRedundantStores and NumFastStores are
+// incremented in DSE::runOnBasicBlock; NumFastOther is adjusted inside
+// DeleteDeadInstruction.
+STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+
+namespace {
+  /// DSE - Function pass that runs dead store elimination on each reachable
+  /// basic block of a function.
+  ///
+  /// The four analysis pointers are filled in at the start of runOnFunction
+  /// and cleared again before it returns, so they never outlive the run that
+  /// obtained them.
+  struct DSE : public FunctionPass {
+    AliasAnalysis *AA;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+    const TargetLibraryInfo *TLI;
+
+    static char ID; // Pass identification, replacement for typeid
+
+    // TLI is initialised together with the other analysis pointers so that no
+    // member is ever left indeterminate.
+    DSE()
+        : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr),
+          TLI(nullptr) {
+      initializeDSEPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Runs runOnBasicBlock on every block of F that is reachable from the
+    /// entry block. Returns true if any block was changed.
+    bool runOnFunction(Function &F) override {
+      if (skipOptnoneFunction(F))
+        return false;
+
+      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+      MD = &getAnalysis<MemoryDependenceAnalysis>();
+      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+      bool Changed = false;
+      for (BasicBlock &I : F)
+        // Only check non-dead blocks. Dead blocks may have strange pointer
+        // cycles that will confuse alias analysis.
+        if (DT->isReachableFromEntry(&I))
+          Changed |= runOnBasicBlock(I);
+
+      // Forget every analysis obtained for this run, TLI included, so no
+      // stale pointer is kept between runs.
+      AA = nullptr;
+      MD = nullptr;
+      DT = nullptr;
+      TLI = nullptr;
+      return Changed;
+    }
+
+    // Helpers defined out of line.
+    bool runOnBasicBlock(BasicBlock &BB);
+    bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI);
+    bool HandleFree(CallInst *F);
+    bool handleEndBlock(BasicBlock &BB);
+    void RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
+                               SmallSetVector<Value *, 16> &DeadStackObjects,
+                               const DataLayout &DL);
+
+    /// Declares the analyses read in runOnFunction as required, and the CFG,
+    /// dominator tree, globals AA and memory dependence results as preserved.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+  };
+}
+
+// Pass identification storage and registration of DSE under the argument
+// string "dse", together with the passes named as its dependencies.
+char DSE::ID = 0;
+INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)
+
+/// Factory: returns a newly allocated DSE pass.
+FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// DeleteDeadInstruction - Erase I, then keep erasing any operand
+/// instruction that loses its last use and is trivially dead, following the
+/// computation tree that fed I.
+///
+/// Each instruction is removed from MD before its operands are touched. If
+/// ValueSet is non-null, every erased instruction is removed from it too.
+/// NumFastOther ends up increased by the number of instructions erased in
+/// addition to I itself.
+static void DeleteDeadInstruction(Instruction *I,
+                                  MemoryDependenceAnalysis &MD,
+                                  const TargetLibraryInfo &TLI,
+                                  SmallSetVector<Value*, 16> *ValueSet = nullptr) {
+  SmallVector<Instruction*, 32> Pending;
+  Pending.push_back(I);
+
+  // The loop below bumps the counter for every instruction it erases,
+  // including I; cancel that one increment up front.
+  --NumFastOther;
+
+  while (!Pending.empty()) {
+    Instruction *Victim = Pending.pop_back_val();
+    ++NumFastOther;
+
+    // Zap the instruction in stages. MemDep goes first: it needs to see the
+    // operands and needs the instruction to still be in the function.
+    MD.removeInstruction(Victim);
+
+    for (unsigned Idx = 0, NumOps = Victim->getNumOperands(); Idx != NumOps;
+         ++Idx) {
+      Value *Operand = Victim->getOperand(Idx);
+      Victim->setOperand(Idx, nullptr);
+
+      // Operands that still have users stay alive.
+      if (!Operand->use_empty())
+        continue;
+
+      // The operand just became unused; queue it if it can be deleted.
+      auto *OperandInst = dyn_cast<Instruction>(Operand);
+      if (OperandInst && isInstructionTriviallyDead(OperandInst, &TLI))
+        Pending.push_back(OperandInst);
+    }
+
+    Victim->eraseFromParent();
+
+    if (ValueSet)
+      ValueSet->remove(Victim);
+  }
+}
+
+
+/// hasMemoryWrite - Does this instruction write some memory? Only returns
+/// true for the kinds of writes the other helpers in this file can analyze:
+/// stores, a fixed set of intrinsics, and calls to strcpy, strncpy, strcat
+/// and strncat when TLI knows them.
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+  if (isa<StoreInst>(I))
+    return true;
+
+  // Intrinsics are decided purely by their ID.
+  if (auto *Intrin = dyn_cast<IntrinsicInst>(I)) {
+    switch (Intrin->getIntrinsicID()) {
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+    case Intrinsic::init_trampoline:
+    case Intrinsic::lifetime_end:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  // What remains are direct calls to the recognised string routines.
+  CallSite CS(I);
+  if (!CS)
+    return false;
+
+  Function *Callee = CS.getCalledFunction();
+  if (!Callee)
+    return false;
+
+  const LibFunc::Func StringWriters[] = {LibFunc::strcpy, LibFunc::strncpy,
+                                         LibFunc::strcat, LibFunc::strncat};
+  for (LibFunc::Func Writer : StringWriters)
+    if (TLI.has(Writer) && Callee->getName() == TLI.getName(Writer))
+      return true;
+
+  return false;
+}
+
+/// getLocForWrite - Return the memory location written by Inst, or a
+/// default-constructed MemoryLocation when the instruction is not one that
+/// is handled here. If isRemovable returns true, this function and
+/// getLocForRead completely describe the memory operations of Inst.
+static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+  if (auto *Store = dyn_cast<StoreInst>(Inst))
+    return MemoryLocation::get(Store);
+
+  // memcpy/memmove/memset: the destination operand is what gets written.
+  if (auto *MemOp = dyn_cast<MemIntrinsic>(Inst))
+    return MemoryLocation::getForDest(MemOp);
+
+  auto *Intrin = dyn_cast<IntrinsicInst>(Inst);
+  if (Intrin == nullptr)
+    return MemoryLocation();
+
+  const auto IID = Intrin->getIntrinsicID();
+
+  if (IID == Intrinsic::init_trampoline) {
+    // FIXME: We don't know the size of the trampoline, so we can't really
+    // handle it here.
+    return MemoryLocation(Intrin->getArgOperand(0));
+  }
+
+  if (IID == Intrinsic::lifetime_end) {
+    // Argument 0 is the constant length, argument 1 the pointer.
+    uint64_t Len = cast<ConstantInt>(Intrin->getArgOperand(0))->getZExtValue();
+    return MemoryLocation(Intrin->getArgOperand(1), Len);
+  }
+
+  return MemoryLocation(); // Unhandled intrinsic.
+}
+
+/// getLocForRead - Return the location read by an instruction accepted by
+/// hasMemoryWrite, or a default-constructed MemoryLocation if it reads
+/// nothing.
+static MemoryLocation getLocForRead(Instruction *Inst,
+                                    const TargetLibraryInfo &TLI) {
+  assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+
+  // Of the handled writers, only the mem transfer instructions
+  // (memcpy/memmove) also read memory.
+  auto *Transfer = dyn_cast<MemTransferInst>(Inst);
+  if (Transfer == nullptr)
+    return MemoryLocation();
+  return MemoryLocation::getForSource(Transfer);
+}
+
+
+/// isRemovable - If neither the value of this instruction nor the memory it
+/// writes is used, may the instruction be deleted?
+static bool isRemovable(Instruction *I) {
+  // Stores qualify only when unordered, i.e. not volatile/atomic.
+  if (auto *Store = dyn_cast<StoreInst>(I))
+    return Store->isUnordered();
+
+  if (auto *Intrin = dyn_cast<IntrinsicInst>(I)) {
+    switch (Intrin->getIntrinsicID()) {
+    case Intrinsic::init_trampoline:
+      // Always safe to remove init_trampoline.
+      return true;
+
+    case Intrinsic::lifetime_end:
+      // Never remove dead lifetime_end's, e.g. because it is followed by a
+      // free.
+      return false;
+
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+      // Volatile memory intrinsics must stay.
+      return !cast<MemIntrinsic>(Intrin)->isVolatile();
+
+    default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+    }
+  }
+
+  // A call is removable only when its result has no users.
+  CallSite CS(I);
+  if (!CS)
+    return false;
+  return CS.getInstruction()->use_empty();
+}
+
+
+/// isShortenable - Returns true if the length of this instruction's write
+/// can safely be reduced. Only the memset and memcpy intrinsics qualify;
+/// stores and library calls are not shortened for now.
+static bool isShortenable(Instruction *I) {
+  // Don't shorten stores for now.
+  if (isa<StoreInst>(I))
+    return false;
+
+  // Anything that is not an intrinsic (e.g. a libcall) is left alone too.
+  auto *Intrin = dyn_cast<IntrinsicInst>(I);
+  if (Intrin == nullptr)
+    return false;
+
+  const auto IID = Intrin->getIntrinsicID();
+  return IID == Intrinsic::memset || IID == Intrinsic::memcpy;
+}
+
+/// getStoredPointerOperand - Return the pointer that the given writing
+/// instruction stores through.
+static Value *getStoredPointerOperand(Instruction *I) {
+  if (auto *Store = dyn_cast<StoreInst>(I))
+    return Store->getPointerOperand();
+
+  if (auto *MemOp = dyn_cast<MemIntrinsic>(I))
+    return MemOp->getDest();
+
+  // The only other intrinsic expected here is init_trampoline.
+  if (auto *Intrin = dyn_cast<IntrinsicInst>(I)) {
+    switch (Intrin->getIntrinsicID()) {
+    case Intrinsic::init_trampoline:
+      return Intrin->getArgOperand(0);
+    default: llvm_unreachable("Unexpected intrinsic!");
+    }
+  }
+
+  // All the supported functions so far happen to have dest as their first
+  // argument.
+  return CallSite(I).getArgument(0);
+}
+
+/// getPointerSize - Return the size reported by getObjectSize for V, or
+/// MemoryLocation::UnknownSize when getObjectSize cannot determine it.
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+                               const TargetLibraryInfo &TLI) {
+  uint64_t ObjSize;
+  if (!getObjectSize(V, ObjSize, DL, &TLI))
+    return MemoryLocation::UnknownSize;
+  return ObjSize;
+}
+
+namespace {
+ // Outcome of comparing a later write against an earlier one; produced by
+ // isOverwrite.
+ enum OverwriteResult
+ {
+ OverwriteComplete, // The later write covers the whole earlier write.
+ OverwriteEnd, // The later write covers the end of the earlier write.
+ OverwriteUnknown // Nothing could be determined.
+ };
+}
+
+/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location
+/// completely overwrites a store to the 'Earlier' location.
+/// 'OverwriteEnd' if the end of the 'Earlier' location is completely
+/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined.
+///
+/// EarlierOff and LaterOff are output parameters. They are assigned only once
+/// the analysis reaches the "base + constant offset" decomposition below; on
+/// every return before that point they are left untouched. 'OverwriteEnd' is
+/// only ever returned after they have been set.
+static OverwriteResult isOverwrite(const MemoryLocation &Later,
+ const MemoryLocation &Earlier,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
+ int64_t &EarlierOff, int64_t &LaterOff) {
+ const Value *P1 = Earlier.Ptr->stripPointerCasts();
+ const Value *P2 = Later.Ptr->stripPointerCasts();
+
+ // If the start pointers are the same, we just have to compare sizes to see if
+ // the later store was larger than the earlier store.
+ if (P1 == P2) {
+ // If we don't know the sizes of either access, then we can't do a
+ // comparison.
+ if (Later.Size == MemoryLocation::UnknownSize ||
+ Earlier.Size == MemoryLocation::UnknownSize)
+ return OverwriteUnknown;
+
+ // Make sure that the Later size is >= the Earlier size.
+ if (Later.Size >= Earlier.Size)
+ return OverwriteComplete;
+ }
+
+ // Otherwise, we have to have size information, and the later store has to be
+ // larger than the earlier one.
+ if (Later.Size == MemoryLocation::UnknownSize ||
+ Earlier.Size == MemoryLocation::UnknownSize)
+ return OverwriteUnknown;
+
+ // Check to see if the later store is to the entire object (either a global,
+ // an alloca, or a byval/inalloca argument). If so, then it clearly
+ // overwrites any other store to the same object.
+ const Value *UO1 = GetUnderlyingObject(P1, DL),
+ *UO2 = GetUnderlyingObject(P2, DL);
+
+ // If we can't resolve the same pointers to the same object, then we can't
+ // analyze them at all.
+ if (UO1 != UO2)
+ return OverwriteUnknown;
+
+ // If the "Later" store is to a recognizable object, get its size.
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+ if (ObjectSize != MemoryLocation::UnknownSize)
+ if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
+ return OverwriteComplete;
+
+ // Okay, we have stores to two completely different pointers. Try to
+ // decompose the pointer into a "base + constant_offset" form. If the base
+ // pointers are equal, then we can reason about the two stores.
+ EarlierOff = 0;
+ LaterOff = 0;
+ const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+ const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
+
+ // If the base pointers still differ, we have two completely different stores.
+ if (BP1 != BP2)
+ return OverwriteUnknown;
+
+ // The later store completely overlaps the earlier store if:
+ //
+ // 1. Both start at the same offset and the later one's size is greater than
+ //    or equal to the earlier one's, or
+ //
+ //      |--earlier--|
+ //      |--   later   --|
+ //
+ // 2. The earlier store has an offset greater than the later offset, but which
+ //    still lies completely within the later store.
+ //
+ //        |--earlier--|
+ //    |-----  later  ------|
+ //
+ // We have to be careful here as *Off is signed while *.Size is unsigned.
+ if (EarlierOff >= LaterOff &&
+ Later.Size >= Earlier.Size &&
+ uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
+ return OverwriteComplete;
+
+ // The other interesting case is if the later store overwrites the end of
+ // the earlier store
+ //
+ //      |--earlier--|
+ //                |--   later   --|
+ //
+ // In this case we may want to trim the size of earlier to avoid generating
+ // writes to addresses which will definitely be overwritten later
+ if (LaterOff > EarlierOff &&
+ LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+ int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size))
+ return OverwriteEnd;
+
+ // Otherwise, they don't completely overlap.
+ return OverwriteUnknown;
+}
+
+/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense. Consider this case:
+///
+///   memcpy(A <- B)
+///   memcpy(A <- A)
+///
+/// Here the second write to A does not make the first write to A dead. The
+/// usual situation is not an explicit A<-A copy like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// Returns true when it is unsafe to remove DepWrite because Inst, the
+/// instruction that would make it dead, may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+                               const MemoryLocation &InstStoreLoc,
+                               Instruction *DepWrite,
+                               const TargetLibraryInfo &TLI,
+                               AliasAnalysis &AA) {
+  // Only instructions that read memory can be self reads.
+  const MemoryLocation ReadByInst = getLocForRead(Inst, TLI);
+  if (ReadByInst.Ptr == nullptr)
+    return false; // Not a reading instruction.
+
+  // If what Inst reads and what it writes obviously don't alias, it cannot
+  // be copying over itself.
+  if (AA.isNoAlias(ReadByInst, InstStoreLoc))
+    return false;
+
+  // Inst may copy over itself. DepWrite can still be removed if it provably
+  // reads from the same location as Inst, which handles useful cases like:
+  //   memcpy(A <- B)
+  //   memcpy(A <- B)
+  // We don't know whether A and B may alias, but B and B are must-aliases,
+  // so removing the first memcpy is safe (assuming it writes no more bytes
+  // than the second one).
+  const MemoryLocation ReadByDep = getLocForRead(DepWrite, TLI);
+  const bool SameSource =
+      ReadByDep.Ptr && AA.isMustAlias(ReadByInst.Ptr, ReadByDep.Ptr);
+
+  // If DepWrite reads nothing, or its source cannot be proven to must-alias
+  // Inst's source, DepWrite cannot be considered dead.
+  return !SameSource;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+
+bool DSE::runOnBasicBlock(BasicBlock &BB) {
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ bool MadeChange = false;
+
+ // Do a top-down walk on the BB.
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ Instruction *Inst = &*BBI++;
+
+ // Handle 'free' calls specially.
+ if (CallInst *F = isFreeCall(Inst, TLI)) {
+ MadeChange |= HandleFree(F);
+ continue;
+ }
+
+ // If we find something that writes memory, get its memory dependence.
+ if (!hasMemoryWrite(Inst, *TLI))
+ continue;
+
+ // If we're storing the same value back to a pointer that we just
+ // loaded from, then the store can be removed.
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+
+ auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) {
+ // DeleteDeadInstruction can delete the current instruction. Save BBI
+ // in case we need it.
+ WeakVH NextInst(&*BBI);
+
+ DeleteDeadInstruction(DeadInst, *MD, *TLI);
+
+ if (!NextInst) // Next instruction deleted.
+ BBI = BB.begin();
+ else if (BBI != BB.begin()) // Revisit this instruction if possible.
+ --BBI;
+ ++NumRedundantStores;
+ MadeChange = true;
+ };
+
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ isRemovable(SI) &&
+ MemoryIsNotModifiedBetween(DepLoad, SI)) {
+
+ DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n "
+ << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n');
+
+ RemoveDeadInstAndUpdateBBI(SI);
+ continue;
+ }
+ }
+
+ // Remove null stores into the calloc'ed objects
+ Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+
+ if (StoredConstant && StoredConstant->isNullValue() &&
+ isRemovable(SI)) {
+ Instruction *UnderlyingPointer = dyn_cast<Instruction>(
+ GetUnderlyingObject(SI->getPointerOperand(), DL));
+
+ if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+ MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) {
+ DEBUG(dbgs()
+ << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
+ << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
+
+ RemoveDeadInstAndUpdateBBI(SI);
+ continue;
+ }
+ }
+ }
+
+ MemDepResult InstDep = MD->getDependency(Inst);
+
+ // Ignore any store where we can't find a local dependence.
+ // FIXME: cross-block DSE would be fun. :)
+ if (!InstDep.isDef() && !InstDep.isClobber())
+ continue;
+
+ // Figure out what location is being stored to.
+ MemoryLocation Loc = getLocForWrite(Inst, *AA);
+
+ // If we didn't get a useful location, fail.
+ if (!Loc.Ptr)
+ continue;
+
+ while (InstDep.isDef() || InstDep.isClobber()) {
+ // Get the memory clobbered by the instruction we depend on. MemDep will
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we
+ // end up depending on a may- or must-aliased load, then we can't optimize
+ // away the store and we bail out. However, if we depend on something
+ // that overwrites the memory location we *can* potentially optimize it.
+ //
+ // Find out what memory location the dependent instruction stores.
+ Instruction *DepWrite = InstDep.getInst();
+ MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+ // If we didn't get a useful location, bail out.
+ if (!DepLoc.Ptr)
+ break;
+
+ // If we find a write that is a) removable (i.e., non-volatile), b) is
+ // completely obliterated by the store to 'Loc', and c) which we know that
+ // 'Inst' doesn't load from, then we can remove it.
+ if (isRemovable(DepWrite) &&
+ !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+ int64_t InstWriteOffset, DepWriteOffset;
+ OverwriteResult OR =
+ isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);
+ if (OR == OverwriteComplete) {
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
+ << *DepWrite << "\n KILLER: " << *Inst << '\n');
+
+ // Delete the store and now-dead instructions that feed it.
+ DeleteDeadInstruction(DepWrite, *MD, *TLI);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // DeleteDeadInstruction can delete the current instruction in loop
+ // cases, reset BBI.
+ BBI = Inst->getIterator();
+ if (BBI != BB.begin())
+ --BBI;
+ break;
+ } else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
+ // TODO: base this on the target vector size so that if the earlier
+ // store was too small to get vector writes anyway then its likely
+ // a good idea to shorten it
+ // Power of 2 vector writes are probably always a bad idea to optimize
+ // as any store/memset/memcpy is likely using vector instructions so
+ // shortening it to not vector size is likely to be slower
+ MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
+ unsigned DepWriteAlign = DepIntrinsic->getAlignment();
+ if (llvm::isPowerOf2_64(InstWriteOffset) ||
+ ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
+
+ DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: "
+ << *DepWrite << "\n KILLER (offset "
+ << InstWriteOffset << ", "
+ << DepLoc.Size << ")"
+ << *Inst << '\n');
+
+ Value* DepWriteLength = DepIntrinsic->getLength();
+ Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
+ InstWriteOffset -
+ DepWriteOffset);
+ DepIntrinsic->setLength(TrimmedLength);
+ MadeChange = true;
+ }
+ }
+ }
+
+ // If this is a may-aliased store that is clobbering the store value, we
+ // can keep searching past it for another must-aliased pointer that stores
+ // to the same location. For example, in:
+ // store -> P
+ // store -> Q
+ // store -> P
+ // we can remove the first store to P even though we don't know if P and Q
+ // alias.
+ if (DepWrite == &BB.front()) break;
+
+ // Can't look past this instruction if it might read 'Loc'.
+ if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
+ break;
+
+ InstDep = MD->getPointerDependencyFrom(Loc, false,
+ DepWrite->getIterator(), &BB);
+ }
+ }
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB);
+
+ return MadeChange;
+}
+
+/// Returns true if the memory which is accessed by the second instruction is not
+/// modified between the first and the second instruction. Blocks are scanned backwards from SecondI's block until FirstI's block is reached, querying alias analysis for each instruction that may write to memory.
+/// Precondition: Second instruction must be dominated by the first
+/// instruction.
+bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI,
+ Instruction *SecondI) {
+ SmallVector<BasicBlock *, 16> WorkList;
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ BasicBlock::iterator FirstBBI(FirstI);
+ ++FirstBBI; // Scanning inside FirstBB begins just after FirstI.
+ BasicBlock::iterator SecondBBI(SecondI);
+ BasicBlock *FirstBB = FirstI->getParent();
+ BasicBlock *SecondBB = SecondI->getParent();
+ MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+
+ // Start checking the block that contains SecondI.
+ WorkList.push_back(SecondBB);
+ bool isFirstBlock = true;
+
+ // Check all blocks going backward until we reach the block of FirstI.
+ while (!WorkList.empty()) {
+ BasicBlock *B = WorkList.pop_back_val();
+
+ // Ignore instructions up to and including FirstI if this is the FirstBB.
+ BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
+
+ BasicBlock::iterator EI;
+ if (isFirstBlock) {
+ // Ignore SecondI and everything after it on the first visit of SecondBB.
+ assert(B == SecondBB && "first block is not the store block");
+ EI = SecondBBI;
+ isFirstBlock = false;
+ } else {
+ // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
+ // In this case we also have to look at instructions after SecondI.
+ EI = B->end();
+ }
+ for (; BI != EI; ++BI) {
+ Instruction *I = &*BI;
+ if (I->mayWriteToMemory() && I != SecondI) {
+ auto Res = AA->getModRefInfo(I, MemLoc);
+ if (Res != MRI_NoModRef) // Any reported Mod or Ref of MemLoc fails the check.
+ return false;
+ }
+ }
+ if (B != FirstBB) {
+ assert(B != &FirstBB->getParent()->getEntryBlock() &&
+ "Should not hit the entry block because SI must be dominated by LI");
+ for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
+ if (!Visited.insert(*PredI).second) // Skip predecessors already queued.
+ continue;
+ WorkList.push_back(*PredI);
+ }
+ }
+ }
+ return true;
+}
+
+/// Append to Blocks every predecessor of BB, other than BB itself, whose
+/// terminator has exactly one successor and which DT reports as reachable
+/// from the function entry.
+static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+                                   BasicBlock *BB, DominatorTree *DT) {
+  pred_iterator PI = pred_begin(BB);
+  pred_iterator PEnd = pred_end(BB);
+  while (PI != PEnd) {
+    BasicBlock *Candidate = *PI;
+    ++PI;
+
+    // A self-loop edge is never followed.
+    const bool IsSelfLoop = (Candidate == BB);
+    if (!IsSelfLoop && Candidate->getTerminator()->getNumSuccessors() == 1 &&
+        DT->isReachableFromEntry(Candidate))
+      Blocks.push_back(Candidate);
+  }
+}
+
+/// HandleFree - Handle frees of entire structures whose dependency is a store
+/// to a field of that structure. Returns true if at least one store was deleted.
+bool DSE::HandleFree(CallInst *F) {
+ bool MadeChange = false;
+
+ MemoryLocation Loc = MemoryLocation(F->getOperand(0)); // Built from the freed pointer alone.
+ SmallVector<BasicBlock *, 16> Blocks;
+ Blocks.push_back(F->getParent());
+ const DataLayout &DL = F->getModule()->getDataLayout();
+
+ while (!Blocks.empty()) {
+ BasicBlock *BB = Blocks.pop_back_val();
+ Instruction *InstPt = BB->getTerminator();
+ if (BB == F->getParent()) InstPt = F; // In the free's own block, query from the call itself.
+
+ MemDepResult Dep =
+ MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
+ while (Dep.isDef() || Dep.isClobber()) {
+ Instruction *Dependency = Dep.getInst();
+ if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+ break;
+
+ Value *DepPointer =
+ GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
+
+ // Check for aliasing: the write's underlying object must be the freed pointer.
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+ break;
+
+ auto Next = ++Dependency->getIterator(); // Taken before Dependency is deleted below.
+
+ // DCE instructions only used to calculate that store
+ DeleteDeadInstruction(Dependency, *MD, *TLI);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // Inst's old Dependency is now deleted. Compute the next dependency,
+ // which may also be dead, as in
+ // s[0] = 0;
+ // s[1] = 0; // This has just been deleted.
+ // free(s);
+ Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB);
+ }
+
+ if (Dep.isNonLocal()) // Nothing more in this block; continue in the predecessors picked by FindUnconditionalPreds.
+ FindUnconditionalPreds(Blocks, BB, DT);
+ }
+
+ return MadeChange;
+}
+
+/// handleEndBlock - Remove dead stores to stack-allocated locations in the
+/// function end block; returns true if anything was deleted. Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+bool DSE::handleEndBlock(BasicBlock &BB) {
+ bool MadeChange = false;
+
+ // Keep track of all of the stack objects that are dead at the end of the
+ // function.
+ SmallSetVector<Value*, 16> DeadStackObjects;
+
+ // Find all of the alloca'd pointers in the entry block.
+ BasicBlock &Entry = BB.getParent()->front();
+ for (Instruction &I : Entry) {
+ if (isa<AllocaInst>(&I))
+ DeadStackObjects.insert(&I);
+
+ // Okay, so these are dead heap objects, but if the pointer never escapes
+ // then it's leaked by this function anyways.
+ else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
+ DeadStackObjects.insert(&I);
+ }
+
+ // Treat byval or inalloca arguments the same, stores to them are dead at the
+ // end of the function.
+ for (Argument &AI : BB.getParent()->args())
+ if (AI.hasByValOrInAllocaAttr())
+ DeadStackObjects.insert(&AI);
+
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+
+ // Scan the basic block backwards
+ for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+ --BBI;
+
+ // If we find a store, check to see if it points into a dead stack value.
+ if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ // See through pointer-to-pointer bitcasts
+ SmallVector<Value *, 4> Pointers;
+ GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
+
+ // Stores to stack values are valid candidates for removal, but only if
+ // every underlying object found above is in the dead set.
+ bool AllDead = true;
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end(); I != E; ++I)
+ if (!DeadStackObjects.count(*I)) {
+ AllDead = false;
+ break;
+ }
+
+ if (AllDead) {
+ Instruction *Dead = &*BBI++; // Step BBI past Dead before it is deleted.
+
+ DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+ E = Pointers.end(); I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ }
+ dbgs() << '\n');
+
+ // DCE instructions only used to calculate that store.
+ DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects);
+ ++NumFastStores;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // Remove any dead non-memory-mutating instructions.
+ if (isInstructionTriviallyDead(&*BBI, TLI)) {
+ Instruction *Inst = &*BBI++; // Step BBI past Inst before it is deleted.
+ DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects);
+ ++NumFastOther;
+ MadeChange = true;
+ continue;
+ }
+
+ if (isa<AllocaInst>(BBI)) {
+ // Remove allocas from the list of dead stack objects; there can't be
+ // any references before the definition.
+ DeadStackObjects.remove(&*BBI);
+ continue;
+ }
+
+ if (auto CS = CallSite(&*BBI)) {
+ // Remove allocation function calls from the list of dead stack objects;
+ // there can't be any references before the definition.
+ if (isAllocLikeFn(&*BBI, TLI))
+ DeadStackObjects.remove(&*BBI);
+
+ // If this call does not access memory, it can't be loading any of our
+ // pointers.
+ if (AA->doesNotAccessMemory(CS))
+ continue;
+
+ // If the call might load from any of our allocas, then any store above
+ // the call is live.
+ DeadStackObjects.remove_if([&](Value *I) {
+ // See if the call site touches the value.
+ ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI));
+
+ return A == MRI_ModRef || A == MRI_Ref;
+ });
+
+ // If all of the allocas were clobbered by the call then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+
+ continue;
+ }
+
+ MemoryLocation LoadedLoc;
+
+ // If we encounter a use of the pointer, it is no longer considered dead
+ if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ if (!L->isUnordered()) // Be conservative with atomic/volatile load
+ break;
+ LoadedLoc = MemoryLocation::get(L);
+ } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+ LoadedLoc = MemoryLocation::get(V);
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
+ LoadedLoc = MemoryLocation::getForSource(MTI);
+ } else if (!BBI->mayReadFromMemory()) {
+ // Instruction doesn't read memory. Note that stores that weren't removed
+ // above will hit this case.
+ continue;
+ } else {
+ // Unknown inst; assume it clobbers everything.
+ break;
+ }
+
+ // Remove any objects from the DeadStackObjects set that are loaded, as this
+ // makes any stores above the access live.
+ RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL);
+
+ // If all of the allocas were clobbered by the access then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+ }
+
+ return MadeChange;
+}
+
+/// RemoveAccessedObjects - Drop from DeadStackObjects every object that the
+/// location LoadedLoc may alias. Such objects are being read, so earlier
+/// stores to them are live.
+void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
+                                SmallSetVector<Value *, 16> &DeadStackObjects,
+                                const DataLayout &DL) {
+  const Value *Base = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+
+  // Constants are never members of the dead set, so there is nothing to do.
+  if (isa<Constant>(Base))
+    return;
+
+  // Fast path: the access resolves directly to an alloca or an argument, so
+  // remove exactly that object and skip the alias queries.
+  const bool IsDirectObject = isa<AllocaInst>(Base) || isa<Argument>(Base);
+  if (IsDirectObject) {
+    DeadStackObjects.remove(const_cast<Value *>(Base));
+    return;
+  }
+
+  // General case: ask alias analysis about each remaining object and drop the
+  // ones that cannot be proven disjoint from the loaded location.
+  auto MayBeRead = [&](Value *Obj) {
+    MemoryLocation ObjLoc(Obj, getPointerSize(Obj, DL, *TLI));
+    return !AA->isNoAlias(ObjLoc, LoadedLoc);
+  };
+  DeadStackObjects.remove_if(MayBeRead);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
new file mode 100644
index 0000000..7ef062e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -0,0 +1,890 @@
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <deque>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "early-cse"
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE, "Number of instructions CSE'd");
+STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
+STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumDSE, "Number of trivial dead stores removed");
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Key type for the scoped hash table of available values. Wraps an
+/// instruction for which canHandle() is true, or a DenseMap marker key.
+struct SimpleValue {
+  Instruction *Inst;
+
+  SimpleValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  /// True when Inst is the DenseMap empty key or tombstone key.
+  bool isSentinel() const {
+    typedef DenseMapInfo<Instruction *> PtrInfo;
+    if (Inst == PtrInfo::getEmptyKey())
+      return true;
+    return Inst == PtrInfo::getTombstoneKey();
+  }
+
+  /// True when \p Inst is one of the instruction kinds this key supports.
+  static bool canHandle(Instruction *Inst) {
+    // Calls qualify only when they do not access memory and return a value.
+    if (CallInst *Call = dyn_cast<CallInst>(Inst)) {
+      if (!Call->doesNotAccessMemory())
+        return false;
+      return !Call->getType()->isVoidTy();
+    }
+
+    if (isa<CastInst>(Inst) || isa<BinaryOperator>(Inst))
+      return true;
+    if (isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst))
+      return true;
+    if (isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst))
+      return true;
+    if (isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst))
+      return true;
+    return isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+  }
+};
+}
+
+namespace llvm {
+/// DenseMapInfo specialization for SimpleValue keys. The empty and tombstone
+/// keys wrap the corresponding Instruction* marker values; hashing and
+/// equality are defined out of line.
+template <> struct DenseMapInfo<SimpleValue> {
+  static inline SimpleValue getEmptyKey() {
+    return SimpleValue(DenseMapInfo<Instruction *>::getEmptyKey());
+  }
+  static inline SimpleValue getTombstoneKey() {
+    return SimpleValue(DenseMapInfo<Instruction *>::getTombstoneKey());
+  }
+  static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+  static unsigned getHashValue(SimpleValue Val);
+};
+}
+
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { // Hash of the wrapped instruction's opcode, operands and selected flags.
+ Instruction *Inst = Val.Inst;
+ // Hash in all of the operands as pointers.
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) // Order operands by address so commuted forms hash alike.
+ std::swap(LHS, RHS);
+
+ if (isa<OverflowingBinaryOperator>(BinOp)) {
+ // Hash the overflow behavior
+ unsigned Overflow =
+ BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
+ BinOp->hasNoUnsignedWrap() *
+ OverflowingBinaryOperator::NoUnsignedWrap;
+ return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
+ }
+
+ return hash_combine(BinOp->getOpcode(), LHS, RHS);
+ }
+
+ if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (Inst->getOperand(0) > Inst->getOperand(1)) { // Order operands by address and swap the predicate to match.
+ std::swap(LHS, RHS);
+ Pred = CI->getSwappedPredicate();
+ }
+ return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(Inst)) // Casts also hash the destination type.
+ return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) // The index list is part of the hash.
+ return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+ hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) // The index list is part of the hash.
+ return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+ IVI->getOperand(1),
+ hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+ assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst)) &&
+ "Invalid/unknown instruction");
+
+ // Mix in the opcode together with every operand.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { // True for identical instructions, and for commuted binary operators or compares.
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+ if (LHS.isSentinel() || RHS.isSentinel()) // Marker keys must not be dereferenced; compare them by pointer only.
+ return LHSI == RHSI;
+
+ if (LHSI->getOpcode() != RHSI->getOpcode())
+ return false;
+ if (LHSI->isIdenticalTo(RHSI))
+ return true;
+
+ // If we're not strictly identical, we still might be a commutable instruction
+ if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+ if (!LHSBinOp->isCommutative())
+ return false;
+
+ assert(isa<BinaryOperator>(RHSI) &&
+ "same opcode, but different instruction type?");
+ BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+ // Check overflow attributes: both wrap flags must agree.
+ if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
+ assert(isa<OverflowingBinaryOperator>(RHSBinOp) &&
+ "same opcode, but different operator type?");
+ if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
+ LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
+ return false;
+ }
+
+ // Commuted equality
+ return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ }
+ if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+ assert(isa<CmpInst>(RHSI) &&
+ "same opcode, but different instruction type?");
+ CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+ // Commuted equality: operands exchanged and predicates that are each other's swap.
+ return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Key type for the scoped hash table of available call values. Wraps
+/// a call for which canHandle() is true, or a DenseMap marker key.
+struct CallValue {
+  Instruction *Inst;
+
+  CallValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  /// True when Inst is the DenseMap empty key or tombstone key.
+  bool isSentinel() const {
+    typedef DenseMapInfo<Instruction *> PtrInfo;
+    if (Inst == PtrInfo::getEmptyKey())
+      return true;
+    return Inst == PtrInfo::getTombstoneKey();
+  }
+
+  /// True when \p Inst is a call that returns a value and only reads memory.
+  static bool canHandle(Instruction *Inst) {
+    // Void results are never value numbered.
+    const bool ProducesValue = !Inst->getType()->isVoidTy();
+    if (!ProducesValue)
+      return false;
+
+    if (CallInst *Call = dyn_cast<CallInst>(Inst))
+      return Call->onlyReadsMemory();
+    return false;
+  }
+};
+}
+
+namespace llvm {
+/// DenseMapInfo specialization for CallValue keys. The empty and tombstone
+/// keys wrap the corresponding Instruction* marker values; hashing and
+/// equality are defined out of line.
+template <> struct DenseMapInfo<CallValue> {
+  static inline CallValue getEmptyKey() {
+    return CallValue(DenseMapInfo<Instruction *>::getEmptyKey());
+  }
+  static inline CallValue getTombstoneKey() {
+    return CallValue(DenseMapInfo<Instruction *>::getTombstoneKey());
+  }
+  static bool isEqual(CallValue LHS, CallValue RHS);
+  static unsigned getHashValue(CallValue Val);
+};
+}
+
+/// Hash a call by its opcode combined with a hash over all of its operands.
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+  Instruction *Call = Val.Inst;
+  hash_code OperandHash =
+      hash_combine_range(Call->value_op_begin(), Call->value_op_end());
+  return hash_combine(Call->getOpcode(), OperandHash);
+}
+
+/// Two call keys are equal when the calls are identical. Marker keys are
+/// compared by pointer only and are never dereferenced.
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+  const bool InvolvesSentinel = LHS.isSentinel() || RHS.isSentinel();
+  return InvolvesSentinel ? LHS.Inst == RHS.Inst
+                          : LHS.Inst->isIdenticalTo(RHS.Inst);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
+public:
+ const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+ typedef RecyclingAllocator<
+ BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
+ typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+ AllocatorTy> ScopedHTType;
+
+ /// \brief A scoped hash table of the current values of all of our simple
+ /// scalar expressions.
+ ///
+ /// As we walk down the domtree, we look to see if instructions are in this:
+ /// if so, we replace them with what we find, otherwise we insert them so
+ /// that dominated values can succeed in their lookup.
+ ScopedHTType AvailableValues;
+
+ /// A scoped hash table of the current values of previously encountered memory
+ /// locations.
+ ///
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
+ /// after every possibly writing memory operation, which ensures that we only
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+ /// present in the table; it is the responsibility of the consumer to inspect
+ /// the atomicity/volatility if needed.
+ struct LoadValue {
+ Value *Data;
+ unsigned Generation;
+ int MatchingId;
+ bool IsAtomic;
+ LoadValue()
+ : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
+ // MatchingId is taken as int, the same type as the member and as
+ // ParseMemoryInst::getMatchingId(), so that the -1 used for regular
+ // loads/stores is stored without passing through an unsigned value.
+ LoadValue(Value *Data, unsigned Generation, int MatchingId,
+ bool IsAtomic)
+ : Data(Data), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic) {}
+ };
+ typedef RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>
+ LoadMapAllocator;
+ typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator> LoadHTType;
+ LoadHTType AvailableLoads;
+
+ /// \brief A scoped hash table of the current values of read-only call
+ /// values.
+ ///
+ /// It uses the same generation count as loads.
+ typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+ CallHTType AvailableCalls;
+
+ /// \brief This is the current generation of the memory value.
+ unsigned CurrentGeneration;
+
+ /// \brief Set up the EarlyCSE runner for a particular function.
+ EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
+ DominatorTree &DT, AssumptionCache &AC)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}
+
+ bool run();
+
+private:
+ // Almost a POD, but needs to call the constructors for the scoped hash
+ // tables so that a new scope gets pushed on. These are RAII so that the
+ // scope gets popped when the NodeScope is destroyed.
+ class NodeScope {
+ public:
+ NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ CallScope(AvailableCalls) {}
+
+ private:
+ NodeScope(const NodeScope &) = delete;
+ void operator=(const NodeScope &) = delete;
+
+ ScopedHTType::ScopeTy Scope;
+ LoadHTType::ScopeTy LoadScope;
+ CallHTType::ScopeTy CallScope;
+ };
+
+ // Contains all the needed information to create a stack for doing a depth
+ // first traversal of the tree. This includes scopes for values, loads, and
+ // calls as well as the generation. There is a child iterator so that the
+ // children do not need to be stored separately.
+ class StackNode {
+ public:
+ // Both generation counters start out as cg; the child generation can be
+ // changed afterwards through childGeneration(unsigned).
+ StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+ DomTreeNode::iterator child, DomTreeNode::iterator end)
+ : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+ EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
+ Processed(false) {}
+
+ // Accessors.
+ unsigned currentGeneration() { return CurrentGeneration; }
+ unsigned childGeneration() { return ChildGeneration; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
+ DomTreeNode::iterator childIter() { return ChildIter; }
+ // Returns the child at the current iterator position and advances past it.
+ DomTreeNode *nextChild() {
+ DomTreeNode *child = *ChildIter;
+ ++ChildIter;
+ return child;
+ }
+ DomTreeNode::iterator end() { return EndIter; }
+ bool isProcessed() { return Processed; }
+ void process() { Processed = true; }
+
+ private:
+ StackNode(const StackNode &) = delete;
+ void operator=(const StackNode &) = delete;
+
+ // Members.
+ unsigned CurrentGeneration;
+ unsigned ChildGeneration;
+ DomTreeNode *Node;
+ DomTreeNode::iterator ChildIter;
+ DomTreeNode::iterator EndIter;
+ NodeScope Scopes;
+ bool Processed;
+ };
+
+ /// \brief Wrapper class to handle memory instructions, including loads,
+ /// stores and intrinsic loads and stores defined by the target.
+ class ParseMemoryInst {
+ public:
+ // An intrinsic is treated as a target memory instruction only when the
+ // target describes it through getTgtMemIntrinsic with exactly one memory
+ // reference.
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+ : IsTargetMemInst(false), Inst(Inst) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+ if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+ IsTargetMemInst = true;
+ }
+ bool isLoad() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return isa<LoadInst>(Inst);
+ }
+ bool isStore() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return isa<StoreInst>(Inst);
+ }
+ bool isAtomic() const {
+ if (IsTargetMemInst) {
+ assert(Info.IsSimple && "need to refine IsSimple in TTI");
+ return false;
+ }
+ return Inst->isAtomic();
+ }
+ bool isUnordered() const {
+ if (IsTargetMemInst) {
+ assert(Info.IsSimple && "need to refine IsSimple in TTI");
+ return true;
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isUnordered();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isUnordered();
+ }
+ // Conservative answer
+ return !Inst->isAtomic();
+ }
+
+ bool isVolatile() const {
+ if (IsTargetMemInst) {
+ assert(Info.IsSimple && "need to refine IsSimple in TTI");
+ return false;
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isVolatile();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isVolatile();
+ }
+ // Conservative answer
+ return true;
+ }
+
+
+ // Two accesses match when both the pointer operand and the matching id
+ // are equal.
+ bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
+ return (getPointerOperand() == Inst.getPointerOperand() &&
+ getMatchingId() == Inst.getMatchingId());
+ }
+ bool isValid() const { return getPointerOperand() != nullptr; }
+
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int getMatchingId() const {
+ if (IsTargetMemInst) return Info.MatchingId;
+ return -1;
+ }
+ // Returns null for anything that is neither a target memory intrinsic,
+ // a load, nor a store.
+ Value *getPointerOperand() const {
+ if (IsTargetMemInst) return Info.PtrVal;
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->getPointerOperand();
+ }
+ return nullptr;
+ }
+ bool mayReadFromMemory() const {
+ if (IsTargetMemInst) return Info.ReadMem;
+ return Inst->mayReadFromMemory();
+ }
+ bool mayWriteToMemory() const {
+ if (IsTargetMemInst) return Info.WriteMem;
+ return Inst->mayWriteToMemory();
+ }
+
+ private:
+ bool IsTargetMemInst;
+ MemIntrinsicInfo Info;
+ Instruction *Inst;
+ };
+
+ bool processNode(DomTreeNode *Node);
+
+ // Returns the value made available by a memory instruction: the load itself
+ // for a LoadInst and the stored operand for a StoreInst. Anything else must
+ // be an intrinsic, for which the target is asked to produce a value of
+ // ExpectedType.
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+ ExpectedType);
+ }
+};
+}
+
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+ BasicBlock *BB = Node->getBlock();
+
+ // If this block has a single predecessor, then the predecessor is the parent
+ // of the domtree node and all of the live out memory values are still current
+ // in this block. If this block has multiple predecessors, then they could
+ // have invalidated the live-out memory values of our parent value. For now,
+ // just be conservative and invalidate memory if this block has multiple
+ // predecessors.
+ if (!BB->getSinglePredecessor())
+ ++CurrentGeneration;
+
+ // If this node has a single predecessor which ends in a conditional branch,
+ // we can infer the value of the branch condition given that we took this
+ // path. We need the single predecessor to ensure there's not another path
+ // which reaches this block where the condition might hold a different
+ // value. Since we're adding this to the scoped hash table (like any other
+ // def), it will have been popped if we encounter a future merge block.
+ if (BasicBlock *Pred = BB->getSinglePredecessor())
+ if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
+ if (BI->isConditional())
+ if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
+ if (SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
+ ConstantInt::getTrue(BB->getContext()) :
+ ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, ConditionalConstant);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *ConditionalConstant
+ << " in " << BB->getName() << "\n");
+ // Replace all dominated uses with the known value
+ replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
+ BasicBlockEdge(Pred, BB));
+ }
+
+ /// LastStore - Keep track of the last non-volatile store that we saw... for
+ /// as long as there in no instruction that reads memory. If we see a store
+ /// to the same location, we delete the dead store. This zaps trivial dead
+ /// stores which can occur in bitfield code among other things.
+ Instruction *LastStore = nullptr;
+
+ bool Changed = false;
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+
+ // See if any instructions in the block can be eliminated. If so, do it. If
+ // not, add them to AvailableValues.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ // Dead instructions should just be removed.
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
+ DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumSimplify;
+ continue;
+ }
+
+ // Skip assume intrinsics, they don't really have side effects (although
+ // they're marked as such to ensure preservation of control dependencies),
+ // and this pass will not disturb any of the assumption's control
+ // dependencies.
+ if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+ DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ continue;
+ }
+
+ // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+ // its simpler value.
+ if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
+ DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
+ Inst->replaceAllUsesWith(V);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumSimplify;
+ continue;
+ }
+
+ // If this is a simple instruction that we can value number, process it.
+ if (SimpleValue::canHandle(Inst)) {
+ // See if the instruction has an available value. If so, use it.
+ if (Value *V = AvailableValues.lookup(Inst)) {
+ DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
+ Inst->replaceAllUsesWith(V);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSE;
+ continue;
+ }
+
+ // Otherwise, just remember that this value is available.
+ AvailableValues.insert(Inst, Inst);
+ continue;
+ }
+
+ ParseMemoryInst MemInst(Inst, TTI);
+ // If this is a non-volatile load, process it.
+ if (MemInst.isValid() && MemInst.isLoad()) {
+ // (conservatively) we can't peak past the ordering implied by this
+ // operation, but we can add this load to our set of available values
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
+ LastStore = nullptr;
+ ++CurrentGeneration;
+ }
+
+ // If we have an available version of this load, and if it is the right
+ // generation, replace this instruction.
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing loads with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ // We can't replace an atomic load with one which isn't also atomic.
+ InVal.IsAtomic >= MemInst.isAtomic()) {
+ Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
+ if (Op != nullptr) {
+ DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.Data << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(Op);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSELoad;
+ continue;
+ }
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If this instruction may read from memory, forget LastStore.
+ // Load/store intrinsics will indicate both a read and a write to
+ // memory. The target may override this (e.g. so that a store intrinsic
+ // does not read from memory, and thus will be treated the same as a
+ // regular store for commoning purposes).
+ if (Inst->mayReadFromMemory() &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
+ LastStore = nullptr;
+
+ // If this is a read-only call, process it.
+ if (CallValue::canHandle(Inst)) {
+ // If we have an available version of this call, and if it is the right
+ // generation, replace this instruction.
+ std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst);
+ if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
+ DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(InVal.first);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSECall;
+ continue;
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableCalls.insert(
+ Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration));
+ continue;
+ }
+
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads 'before' the fence. As a
+ // result, we don't need to consider it as writing to memory and don't need
+ // to advance the generation. We do need to prevent DSE across the fence,
+ // but that's handled above.
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ if (FI->getOrdering() == Release) {
+ assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
+ continue;
+ }
+
+ // write back DSE - If we write back the same value we just loaded from
+ // the same location and haven't passed any intervening writes or ordering
+ // operations, we can remove the write. The primary benefit is in allowing
+ // the available load table to remain valid and value forward past where
+ // the store originally was.
+ if (MemInst.isValid() && MemInst.isStore()) {
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.Data &&
+ InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) &&
+ InVal.Generation == CurrentGeneration &&
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing stores with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered()) {
+ assert((!LastStore ||
+ ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+ MemInst.getPointerOperand()) &&
+ "can't have an intervening store!");
+ DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ // We can avoid incrementing the generation count since we were able
+ // to eliminate this store.
+ continue;
+ }
+ }
+
+ // Okay, this isn't something we can CSE at all. Check to see if it is
+ // something that could modify memory. If so, our available memory values
+ // cannot be used so bump the generation count.
+ if (Inst->mayWriteToMemory()) {
+ ++CurrentGeneration;
+
+ if (MemInst.isValid() && MemInst.isStore()) {
+ // We do a trivial form of DSE if there are two stores to the same
+ // location with no intervening loads. Delete the earlier store.
+ // At the moment, we don't remove ordered stores, but do remove
+ // unordered atomic stores. There's no special requirement (for
+ // unordered atomics) about removing atomic stores only in favor of
+ // other atomic stores since we we're going to execute the non-atomic
+ // one anyway and the atomic one might never have become visible.
+ if (LastStore) {
+ ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ assert(LastStoreMemInst.isUnordered() &&
+ !LastStoreMemInst.isVolatile() &&
+ "Violated invariant");
+ if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+ DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
+ // fallthrough - we can exploit information about this store
+ }
+
+ // Okay, we just invalidated anything we knew about loaded values. Try
+ // to salvage *something* by remembering that the stored value is a live
+ // version of the pointer. It is safe to forward from volatile stores
+ // to non-volatile loads, so we don't have to check for volatility of
+ // the store.
+ AvailableLoads.insert(
+ MemInst.getPointerOperand(),
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+
+ // Remember that this was the last unordered store we saw for DSE. We
+ // don't yet handle DSE on ordered or volatile stores since we don't
+ // have a good way to model the ordering requirement for following
+ // passes once the store is removed. We could insert a fence, but
+ // since fences are slightly stronger than stores in their ordering,
+ // it's not clear this is a profitable transform. Another option would
+ // be to merge the ordering with that of the post dominating store.
+ if (MemInst.isUnordered() && !MemInst.isVolatile())
+ LastStore = Inst;
+ else
+ LastStore = nullptr;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool EarlyCSE::run() { // Returns true if any processed node changed the IR.
+ // Note, deque is being used here because there is significant performance
+ // gains over vector when the container becomes very large due to the
+ // specific access patterns. For more information see the mailing list
+ // discussion on this:
+ // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+ std::deque<StackNode *> nodesToProcess;
+
+ bool Changed = false;
+
+ // Seed the stack with the dominator tree's root node.
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
+ DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+
+ // Save the current generation.
+ unsigned LiveOutGeneration = CurrentGeneration;
+
+ // Process the stack.
+ while (!nodesToProcess.empty()) {
+ // Look at the node on top of the stack and adopt its generation. The node
+ // is only popped (last branch below) once it has no children left to visit.
+ StackNode *NodeToProcess = nodesToProcess.back();
+
+ // Initialize class members.
+ CurrentGeneration = NodeToProcess->currentGeneration();
+
+ // Check if the node needs to be processed.
+ if (!NodeToProcess->isProcessed()) {
+ // Process the node and record the generation its children start from.
+ Changed |= processNode(NodeToProcess->node());
+ NodeToProcess->childGeneration(CurrentGeneration);
+ NodeToProcess->process();
+ } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+ // Push the next child onto the stack.
+ DomTreeNode *child = NodeToProcess->nextChild();
+ nodesToProcess.push_back(
+ new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+ NodeToProcess->childGeneration(), child, child->begin(),
+ child->end()));
+ } else {
+ // It has been processed, and there are no more children to process,
+ // so delete it and pop it off the stack.
+ delete NodeToProcess;
+ nodesToProcess.pop_back();
+ }
+ } // while (!nodes...)
+
+ // Reset the current generation.
+ CurrentGeneration = LiveOutGeneration;
+
+ return Changed;
+}
+
+PreservedAnalyses EarlyCSEPass::run(Function &F, // New pass manager entry point.
+ AnalysisManager<Function> *AM) {
+ auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+ auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+ EarlyCSE CSE(TLI, TTI, DT, AC);
+
+ if (!CSE.run()) // Nothing changed, so every analysis is still valid.
+ return PreservedAnalyses::all();
+
+ // CSE preserves the dominator tree because it doesn't mutate the CFG.
+ // FIXME: Bundle this with other CFG-preservation.
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSELegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ EarlyCSELegacyPass() : FunctionPass(ID) {
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { // Returns true if F was modified.
+ if (skipOptnoneFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ EarlyCSE CSE(TLI, TTI, DT, AC); // Same worker the new-PM EarlyCSEPass uses.
+
+ return CSE.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char EarlyCSELegacyPass::ID = 0; // Pass identification, replacement for typeid.
+
+FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // Same four passes
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) // that getAnalysisUsage()
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // marks as required.
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
new file mode 100644
index 0000000..185cdbd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -0,0 +1,80 @@
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+struct FlattenCFGPass : public FunctionPass { // Legacy function pass wrapper.
+ static char ID; // Pass identification, replacement for typeid
+public:
+ FlattenCFGPass() : FunctionPass(ID) {
+ initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ AliasAnalysis *AA; // Assigned at the start of runOnFunction().
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) // Required by getAnalysisUsage().
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+
+// Public interface to the FlattenCFG pass; returns a newly allocated pass.
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made. Returns true if anything changed.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and try to flatten the CFG at each one.
+ // The iterator is advanced before the call (presumably because FlattenCFG may remove the block).
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end();) {
+ if (FlattenCFG(&*BBIt++, AA)) {
+ LocalChange = true;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
+bool FlattenCFGPass::runOnFunction(Function &F) { // Returns true if F changed.
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ bool EverChanged = false;
+ // iterativelyFlattenCFG can make some blocks dead, so clean up and go again.
+ while (iterativelyFlattenCFG(F, AA)) {
+ removeUnreachableBlocks(F);
+ EverChanged = true;
+ }
+ return EverChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
new file mode 100644
index 0000000..7f5d786
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -0,0 +1,543 @@
+//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Float2Int pass, which aims to demote floating
+// point operations to work on integers, where that is losslessly possible.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "float2int"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <deque>
+#include <functional> // For std::function
+using namespace llvm;
+
+// The algorithm is simple. Start at instructions that convert from the
+// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
+// graph, using an equivalence data structure to unify graphs that interfere.
+//
+// Mappable instructions are those with an integer corollary that, given
+// integer domain inputs, produce an integer output; fadd, for example.
+//
+// If a non-mappable instruction is seen, this entire def-use graph is marked
+// as non-transformable. If we see an instruction that converts from the
+// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
+
+/// The largest integer type worth dealing with (hidden option, defaults to 64).
+static cl::opt<unsigned>
+MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
+ cl::desc("Max integer bitwidth to consider in float2int "
+ "(default=64)"));
+
+namespace {
+ struct Float2Int : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Float2Int() : FunctionPass(ID) {
+ initializeFloat2IntPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); // NOTE: parameter shadows the Roots member.
+ ConstantRange seen(Instruction *I, ConstantRange R);
+ ConstantRange badRange();
+ ConstantRange unknownRange();
+ ConstantRange validateRange(ConstantRange R);
+ void walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots);
+ void walkForwards();
+ bool validateAndTransform();
+ Value *convert(Instruction *I, Type *ToTy);
+ void cleanup();
+
+ MapVector<Instruction*, ConstantRange > SeenInsts; // Range recorded per visited instruction.
+ SmallPtrSet<Instruction*,8> Roots; // FP->int conversions and mappable fcmps.
+ EquivalenceClasses<Instruction*> ECs; // Unified def-use graphs.
+ MapVector<Instruction*, Value*> ConvertedInsts; // Original -> integer replacement.
+ LLVMContext *Ctx; // Set in runOnFunction().
+ };
+}
+
+char Float2Int::ID = 0; // Pass identification, replacement for typeid.
+INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false)
+
+// Given a FCmp predicate, return a matching (signed) ICmp predicate if one
+// exists, otherwise return BAD_ICMP_PREDICATE. Ordered/unordered forms map alike.
+static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ:
+ return CmpInst::ICMP_EQ;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT:
+ return CmpInst::ICMP_SGT;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
+ return CmpInst::ICMP_SGE;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT:
+ return CmpInst::ICMP_SLT;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
+ return CmpInst::ICMP_SLE;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE:
+ return CmpInst::ICMP_NE;
+ default: // Every other predicate has no integer counterpart here.
+ return CmpInst::BAD_ICMP_PREDICATE;
+ }
+}
+
+// Given a floating point binary operator (FAdd, FSub or FMul), return the
+// matching integer version. Any other opcode is a programming error.
+static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case Instruction::FAdd: return Instruction::Add;
+ case Instruction::FSub: return Instruction::Sub;
+ case Instruction::FMul: return Instruction::Mul;
+ }
+}
+
+// Find the roots - instructions that convert from the FP domain to
+// integer domain - and insert them into the Roots set passed by the caller.
+void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
+ for (auto &I : instructions(F)) {
+ if (isa<VectorType>(I.getType())) // Vector-typed results are never roots.
+ continue;
+ switch (I.getOpcode()) {
+ default: break;
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Roots.insert(&I);
+ break;
+ case Instruction::FCmp: // Only fcmps whose predicate maps to an icmp.
+ if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+ CmpInst::BAD_ICMP_PREDICATE)
+ Roots.insert(&I);
+ break;
+ }
+ }
+}
+
+// Helper - record (or overwrite) R as the range of the traversed instruction I,
+// and hand R back so callers can use the call as an expression.
+ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) {
+ DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ // insert() leaves an existing entry untouched, so overwrite it by hand.
+ auto Slot = SeenInsts.insert(std::make_pair(I, R));
+ if (!Slot.second)
+ Slot.first->second = R;
+ return R;
+}
+
+// Helper - get a range representing a poison value.
+ConstantRange Float2Int::badRange() {
+ return ConstantRange(MaxIntegerBW + 1, true); // true => full set.
+}
+ConstantRange Float2Int::unknownRange() { // Placeholder until walkForwards().
+ return ConstantRange(MaxIntegerBW + 1, false); // false => empty set.
+}
+ConstantRange Float2Int::validateRange(ConstantRange R) {
+ if (R.getBitWidth() > MaxIntegerBW + 1) // Wider than we are willing to handle.
+ return badRange();
+ return R;
+}
+
+// The most obvious way to structure the search is a depth-first, eager
+// search from each root. However, that requires direct recursion and so
+// can only handle small instruction sequences. Instead, we split the search
+// up into two phases:
+// - walkBackwards: A breadth-first walk of the use-def graph starting from
+// the roots. Populate "SeenInsts" with interesting
+// instructions and poison values if they're obvious and
+// cheap to compute. Calculate the equivalence set structure
+// while we're here too.
+// - walkForwards: Iterate over SeenInsts in reverse order, so we visit
+// defs before their uses. Calculate the real range info.
+
+// Breadth-first walk of the use-def graph; determine the set of nodes
+// we care about and eagerly determine if some of them are poisonous.
+void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
+ std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back(); // NOTE(review): taken from the back, i.e. LIFO order.
+ Worklist.pop_back();
+
+ if (SeenInsts.find(I) != SeenInsts.end())
+ // Seen already.
+ continue;
+
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ // Path terminated uncleanly.
+ seen(I, badRange());
+ break;
+
+ case Instruction::UIToFP: {
+ // Path terminated cleanly; `continue` skips the operand walk below.
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ APInt Min = APInt::getMinValue(BW).zextOrSelf(MaxIntegerBW+1);
+ APInt Max = APInt::getMaxValue(BW).zextOrSelf(MaxIntegerBW+1);
+ seen(I, validateRange(ConstantRange(Min, Max)));
+ continue;
+ }
+
+ case Instruction::SIToFP: {
+ // Path terminated cleanly; `continue` skips the operand walk below.
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ APInt SMin = APInt::getSignedMinValue(BW).sextOrSelf(MaxIntegerBW+1);
+ APInt SMax = APInt::getSignedMaxValue(BW).sextOrSelf(MaxIntegerBW+1);
+ seen(I, validateRange(ConstantRange(SMin, SMax)));
+ continue;
+ }
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FCmp:
+ seen(I, unknownRange()); // Real range is filled in by walkForwards().
+ break;
+ }
+
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ // Unify def-use chains if they interfere.
+ ECs.unionSets(I, OI);
+ if (SeenInsts.find(I)->second != badRange()) // Don't walk past poison.
+ Worklist.push_back(OI);
+ } else if (!isa<ConstantFP>(O)) {
+ // Not an instruction or ConstantFP? we can't do anything.
+ seen(I, badRange());
+ }
+ }
+ }
+}
+
+// Visit SeenInsts in reverse insertion order (defs before uses) and replace each
+// still-unknown range with one computed from the ranges of its operands.
+void Float2Int::walkForwards() {
+ for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) {
+ if (It.second != unknownRange()) // Already resolved by walkBackwards().
+ continue;
+
+ Instruction *I = It.first;
+ std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ llvm_unreachable("Should have been handled in walkBackwards!");
+
+ case Instruction::FAdd:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FAdd is a binary operator!");
+ return Ops[0].add(Ops[1]);
+ };
+ break;
+
+ case Instruction::FSub:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FSub is a binary operator!");
+ return Ops[0].sub(Ops[1]);
+ };
+ break;
+
+ case Instruction::FMul:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FMul is a binary operator!");
+ return Ops[0].multiply(Ops[1]);
+ };
+ break;
+
+ //
+ // Root-only instructions - we'll only see these if they're the
+ // first node in a walk.
+ //
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
+ return Ops[0];
+ };
+ break;
+
+ case Instruction::FCmp:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FCmp is a binary operator!");
+ return Ops[0].unionWith(Ops[1]);
+ };
+ break;
+ }
+
+ bool Abort = false; // Set when an operand forces I to badRange().
+ SmallVector<ConstantRange,4> OpRanges;
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ assert(SeenInsts.find(OI) != SeenInsts.end() &&
+ "def not seen before use!");
+ OpRanges.push_back(SeenInsts.find(OI)->second);
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
+ // Work out if the floating point number can be losslessly represented
+ // as an integer.
+ // APFloat::convertToInteger(&Exact) purports to do what we want, but
+ // the exactness can be too precise. For example, negative zero can
+ // never be exactly converted to an integer.
+ //
+ // Instead, we ask APFloat to round itself to an integral value - this
+ // preserves sign-of-zero - then compare the result with the original.
+ //
+ APFloat F = CF->getValueAPF();
+
+ // First, weed out obviously incorrect values. Non-finite numbers
+ // can't be represented and neither can negative zero, unless
+ // we're in fast math mode.
+ if (!F.isFinite() ||
+ (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
+ !I->hasNoSignedZeros())) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+
+ APFloat NewF = F;
+ auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (Res != APFloat::opOK || NewF.compare(F) != APFloat::cmpEqual) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+ // OK, it's representable. Now get it.
+ APSInt Int(MaxIntegerBW+1, false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Int,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ OpRanges.push_back(ConstantRange(Int));
+ } else {
+ llvm_unreachable("Should have already marked this as badRange!");
+ }
+ }
+
+ // Reduce the operands' ranges to a single range and record it for I.
+ if (!Abort)
+ seen(I, Op(OpRanges));
+ }
+}
+
+// If there is a valid transform to be done, do it. Returns true if converted.
+bool Float2Int::validateAndTransform() {
+ bool MadeChange = false;
+
+ // Iterate over every disjoint partition of the def-use graph.
+ for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
+ ConstantRange R(MaxIntegerBW + 1, false); // Same value as unknownRange().
+ bool Fail = false;
+ Type *ConvertedToTy = nullptr;
+
+ // For every member of the partition, union all the ranges together.
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI) {
+ Instruction *I = *MI;
+ auto SeenI = SeenInsts.find(I);
+ if (SeenI == SeenInsts.end())
+ continue;
+
+ R = R.unionWith(SeenI->second);
+ // We need to ensure I has no users that have not been seen.
+ // If it does, transformation would be illegal.
+ //
+ // Don't count the roots, as they terminate the graphs.
+ if (Roots.count(I) == 0) {
+ // Set the type of the conversion while we're here.
+ if (!ConvertedToTy)
+ ConvertedToTy = I->getType();
+ for (User *U : I->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
+ DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ Fail = true;
+ break;
+ }
+ }
+ }
+ if (Fail)
+ break;
+ }
+
+ // If the set was empty, or we failed, or the range is poisonous,
+ // bail out.
+ if (ECs.member_begin(It) == ECs.member_end() || Fail ||
+ R.isFullSet() || R.isSignWrappedSet())
+ continue;
+ assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
+
+ // The number of bits required is the maximum of the upper and
+ // lower limits, plus one so it can be signed.
+ unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
+ R.getUpper().getMinSignedBits()) + 1;
+ DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+
+ // If we've run off the realms of the exactly representable integers,
+ // the floating point result will differ from an integer approximation.
+
+ // Do we need more bits than are in the mantissa of the type we converted
+ // to? semanticsPrecision returns the number of mantissa bits plus one
+ // for the sign bit.
+ unsigned MaxRepresentableBits
+ = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
+ if (MinBW > MaxRepresentableBits) {
+ DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ continue;
+ }
+ if (MinBW > 64) { // Only i32/i64 are produced below.
+ DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ continue;
+ }
+
+ // OK, R is known to be representable. Now pick a type for it.
+ // FIXME: Pick the smallest legal type that will fit.
+ Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI)
+ convert(*MI, Ty);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+Value *Float2Int::convert(Instruction *I, Type *ToTy) { // Returns I's integer replacement.
+ if (ConvertedInsts.find(I) != ConvertedInsts.end())
+ // Already converted this instruction.
+ return ConvertedInsts[I];
+
+ SmallVector<Value*,4> NewOperands;
+ for (Value *V : I->operands()) {
+ // Don't recurse if we're an instruction that terminates the path.
+ if (I->getOpcode() == Instruction::UIToFP ||
+ I->getOpcode() == Instruction::SIToFP) {
+ NewOperands.push_back(V);
+ } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
+ NewOperands.push_back(convert(VI, ToTy)); // Convert the def first.
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ APSInt Val(ToTy->getPrimitiveSizeInBits(), /*IsUnsigned=*/false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Val,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ NewOperands.push_back(ConstantInt::get(ToTy, Val));
+ } else {
+ llvm_unreachable("Unhandled operand type?");
+ }
+ }
+
+ // Now create a new instruction, inserted immediately before I.
+ IRBuilder<> IRB(I);
+ Value *NewV = nullptr;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unhandled instruction!");
+
+ case Instruction::FPToUI: // Roots keep their original integer result type.
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FPToSI:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FCmp: {
+ CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
+ assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
+ NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
+ break;
+ }
+
+ case Instruction::UIToFP: // Int->FP casts become plain resizes to ToTy.
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::SIToFP:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
+ NewOperands[0], NewOperands[1],
+ I->getName());
+ break;
+ }
+
+ // If we're a root instruction, RAUW.
+ if (Roots.count(I))
+ I->replaceAllUsesWith(NewV);
+
+ ConvertedInsts[I] = NewV; // Memoize so shared operands convert only once.
+ return NewV;
+}
+
+// Erase every original instruction that convert() replaced, newest entry first.
+void Float2Int::cleanup() {
+ for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend()))
+ I.first->eraseFromParent();
+}
+
+bool Float2Int::runOnFunction(Function &F) { // Returns true if F was modified.
+ if (skipOptnoneFunction(F))
+ return false;
+
+ DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ // Clear out all state left over from a previously processed function.
+ ECs = EquivalenceClasses<Instruction*>();
+ SeenInsts.clear();
+ ConvertedInsts.clear();
+ Roots.clear();
+
+ Ctx = &F.getParent()->getContext();
+
+ findRoots(F, Roots);
+
+ walkBackwards(Roots);
+ walkForwards();
+
+ bool Modified = validateAndTransform();
+ if (Modified) // Originals are only erased when something was converted.
+ cleanup();
+ return Modified;
+}
+
+FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } // Caller receives a new pass.
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 0000000..a028b8c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,2931 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions. It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <vector>
+using namespace llvm;
+using namespace PatternMatch;
+
+ // Tag used by the DEBUG() and STATISTIC() machinery for this file.
+ #define DEBUG_TYPE "gvn"
+
+ // Counters reported by this pass; each string is the description shown
+ // alongside the counter value.
+ STATISTIC(NumGVNInstr, "Number of instructions deleted");
+ STATISTIC(NumGVNLoad, "Number of loads deleted");
+ STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+ STATISTIC(NumGVNBlocks, "Number of blocks merged");
+ STATISTIC(NumGVNSimpl, "Number of instructions simplified");
+ STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+ STATISTIC(NumPRELoad, "Number of loads PRE'd");
+
+ // Hidden boolean flag "enable-pre", on by default.
+ // NOTE(review): presumably gates scalar PRE; the use site is outside this
+ // part of the file -- confirm.
+ static cl::opt<bool> EnablePRE("enable-pre",
+ cl::init(true), cl::Hidden);
+ // Boolean flag "enable-load-pre", on by default. Unlike "enable-pre" it is
+ // not marked cl::Hidden.
+ // NOTE(review): presumably gates load PRE; the use site is outside this part
+ // of the file -- confirm.
+ static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+ // Maximum allowed recursion depth. IsValueFullyAvailableInBlock gives up
+ // (reports "not available") once its recursion depth exceeds this value.
+ static cl::opt<uint32_t>
+ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+ cl::desc("Max recurse depth (default = 1000)"));
+
+//===----------------------------------------------------------------------===//
+// ValueTable Class
+//===----------------------------------------------------------------------===//
+
+/// This class holds the mapping between values and value numbers. It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+namespace {
+ /// A hashable, comparable description of a computation: an opcode, a result
+ /// type and the value numbers (plus any literal indices) of its operands.
+ ///
+ /// The opcodes ~0U and ~1U are reserved as the DenseMap empty and tombstone
+ /// keys; for those two, only the opcode takes part in equality. ~2U is the
+ /// opcode of a default-constructed expression.
+ struct Expression {
+   uint32_t opcode;
+   Type *type;
+   SmallVector<uint32_t, 4> varargs;
+
+   // 'type' is initialised explicitly so that sentinel keys (which are built
+   // from an opcode alone and then copied around by DenseMap) never carry an
+   // indeterminate pointer.
+   Expression(uint32_t o = ~2U) : opcode(o), type(nullptr) { }
+
+   bool operator==(const Expression &other) const {
+     if (opcode != other.opcode)
+       return false;
+     // Sentinel keys compare equal on opcode alone.
+     if (opcode == ~0U || opcode == ~1U)
+       return true;
+     if (type != other.type)
+       return false;
+     if (varargs != other.varargs)
+       return false;
+     return true;
+   }
+
+   /// Hash over the opcode, the type pointer and every entry of varargs.
+   friend hash_code hash_value(const Expression &Value) {
+     return hash_combine(Value.opcode, Value.type,
+                         hash_combine_range(Value.varargs.begin(),
+                                            Value.varargs.end()));
+   }
+ };
+
+ /// Maps values to value numbers. Two values that receive the same number are
+ /// considered equivalent by the pass.
+ ///
+ /// Number 0 is never handed out (numbering starts at 1), which lets a zero
+ /// entry in expressionNumbering mean "no number assigned yet".
+ class ValueTable {
+   DenseMap<Value*, uint32_t> valueNumbering;
+   DenseMap<Expression, uint32_t> expressionNumbering;
+   AliasAnalysis *AA;
+   MemoryDependenceAnalysis *MD;
+   DominatorTree *DT;
+
+   uint32_t nextValueNumber;
+
+   Expression create_expression(Instruction* I);
+   Expression create_cmp_expression(unsigned Opcode,
+                                    CmpInst::Predicate Predicate,
+                                    Value *LHS, Value *RHS);
+   Expression create_extractvalue_expression(ExtractValueInst* EI);
+   uint32_t lookup_or_add_call(CallInst* C);
+ public:
+   // The analysis pointers start out null instead of indeterminate:
+   // lookup_or_add_call tests MD against null, so it must have a defined
+   // value even before setMemDep() has been called.
+   ValueTable() : AA(nullptr), MD(nullptr), DT(nullptr), nextValueNumber(1) { }
+   uint32_t lookup_or_add(Value *V);
+   uint32_t lookup(Value *V) const;
+   uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred,
+                              Value *LHS, Value *RHS);
+   bool exists(Value *V) const;
+   void add(Value *V, uint32_t num);
+   void clear();
+   void erase(Value *v);
+   void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
+   AliasAnalysis *getAliasAnalysis() const { return AA; }
+   void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+   void setDomTree(DominatorTree* D) { DT = D; }
+   /// The number that the next newly numbered value would receive.
+   uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+   void verifyRemoved(const Value *) const;
+ };
+}
+
+ namespace llvm {
+ /// DenseMap traits for Expression. The empty and tombstone keys are
+ /// expressions whose opcode is ~0U and ~1U respectively (built through the
+ /// implicit Expression(uint32_t) constructor).
+ template <> struct DenseMapInfo<Expression> {
+   static inline Expression getEmptyKey() {
+     return ~0U;
+   }
+
+   static inline Expression getTombstoneKey() {
+     return ~1U;
+   }
+
+   // Takes the key by const reference: passing it by value would copy the
+   // expression's SmallVector of operands on every single hash computation.
+   static unsigned getHashValue(const Expression &e) {
+     using llvm::hash_value;
+     return static_cast<unsigned>(hash_value(e));
+   }
+   static bool isEqual(const Expression &LHS, const Expression &RHS) {
+     return LHS == RHS;
+   }
+ };
+
+ }
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+ /// Build the Expression describing instruction I: its opcode, its type and
+ /// the value numbers of all of its operands (numbering operands on demand).
+ Expression ValueTable::create_expression(Instruction *I) {
+   Expression Result;
+   Result.type = I->getType();
+   Result.opcode = I->getOpcode();
+
+   for (Instruction::op_iterator Op = I->op_begin(), OpEnd = I->op_end();
+        Op != OpEnd; ++Op) {
+     const uint32_t OperandVN = lookup_or_add(*Op);
+     Result.varargs.push_back(OperandVN);
+   }
+
+   if (I->isCommutative()) {
+     // Canonicalise the operand order so that "a op b" and "b op a" produce
+     // the same expression. Commutative instructions have exactly two
+     // operands, so a single conditional swap does the sorting.
+     assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+     uint32_t &First = Result.varargs[0];
+     uint32_t &Second = Result.varargs[1];
+     if (First > Second)
+       std::swap(First, Second);
+   }
+
+   if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+     // Put the smaller value number first and mirror the predicate, so that
+     // x<y and y>x end up with the same value number. The predicate is folded
+     // into the low bits of the opcode.
+     CmpInst::Predicate Pred = Cmp->getPredicate();
+     if (Result.varargs[0] > Result.varargs[1]) {
+       std::swap(Result.varargs[0], Result.varargs[1]);
+       Pred = CmpInst::getSwappedPredicate(Pred);
+     }
+     Result.opcode = (Cmp->getOpcode() << 8) | Pred;
+     return Result;
+   }
+
+   // insertvalue: the literal indices are part of the expression's identity.
+   if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I))
+     Result.varargs.append(IVI->idx_begin(), IVI->idx_end());
+
+   return Result;
+ }
+
+ /// Build the Expression for a comparison that is described by its parts
+ /// rather than by an existing instruction. Opcode must be ICmp or FCmp.
+ Expression ValueTable::create_cmp_expression(unsigned Opcode,
+                                              CmpInst::Predicate Predicate,
+                                              Value *LHS, Value *RHS) {
+   assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+          "Not a comparison!");
+   Expression Result;
+   Result.type = CmpInst::makeCmpResultType(LHS->getType());
+
+   // Number the left operand first, then the right one.
+   const uint32_t LeftVN = lookup_or_add(LHS);
+   Result.varargs.push_back(LeftVN);
+   const uint32_t RightVN = lookup_or_add(RHS);
+   Result.varargs.push_back(RightVN);
+
+   // Canonical form: smaller value number first, predicate mirrored to match,
+   // so x<y and y>x get the same value number.
+   if (Result.varargs[0] > Result.varargs[1]) {
+     std::swap(Result.varargs[0], Result.varargs[1]);
+     Predicate = CmpInst::getSwappedPredicate(Predicate);
+   }
+
+   Result.opcode = (Opcode << 8) | Predicate;
+   return Result;
+ }
+
+ /// Build the Expression for an extractvalue. When it extracts field 0 of one
+ /// of the recognised *.with.overflow intrinsics, the expression is instead
+ /// synthesised as the equivalent plain add/sub/mul of the intrinsic's two
+ /// arguments.
+ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
+   assert(EI && "Not an ExtractValueInst?");
+   Expression Result;
+   Result.type = EI->getType();
+   Result.opcode = 0;
+
+   IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+   const bool ExtractsFieldZero =
+       Intrin != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0;
+
+   if (ExtractsFieldZero) {
+     // Map the recognised intrinsics onto the arithmetic opcode they compute;
+     // anything else leaves the opcode at 0.
+     switch (Intrin->getIntrinsicID()) {
+     case Intrinsic::sadd_with_overflow:
+     case Intrinsic::uadd_with_overflow:
+       Result.opcode = Instruction::Add;
+       break;
+     case Intrinsic::ssub_with_overflow:
+     case Intrinsic::usub_with_overflow:
+       Result.opcode = Instruction::Sub;
+       break;
+     case Intrinsic::smul_with_overflow:
+     case Intrinsic::umul_with_overflow:
+       Result.opcode = Instruction::Mul;
+       break;
+     default:
+       break;
+     }
+
+     if (Result.opcode != 0) {
+       // Recognised: the operands are the intrinsic's two arguments.
+       assert(Intrin->getNumArgOperands() == 2 &&
+              "Expect two args for recognised intrinsics.");
+       Result.varargs.push_back(lookup_or_add(Intrin->getArgOperand(0)));
+       Result.varargs.push_back(lookup_or_add(Intrin->getArgOperand(1)));
+       return Result;
+     }
+   }
+
+   // Generic extractvalue: opcode, numbered operands, then the literal
+   // indices.
+   Result.opcode = EI->getOpcode();
+   for (Instruction::op_iterator Op = EI->op_begin(), OpEnd = EI->op_end();
+        Op != OpEnd; ++Op)
+     Result.varargs.push_back(lookup_or_add(*Op));
+
+   Result.varargs.append(EI->idx_begin(), EI->idx_end());
+
+   return Result;
+ }
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+ /// add - Record that V has value number num. Uses DenseMap::insert, so an
+ /// already present entry for V is left untouched.
+ void ValueTable::add(Value *V, uint32_t num) {
+   std::pair<Value*, uint32_t> Entry(V, num);
+   valueNumbering.insert(Entry);
+ }
+
+ /// Assign (or find) the value number of call C and record it in
+ /// valueNumbering. Three cases, decided by what alias analysis reports:
+ ///  - the call does not access memory: numbered purely by its expression;
+ ///  - the call only reads memory: it may share the number of an identical
+ ///    earlier call, but only if memory dependence analysis shows that call
+ ///    to be its (local or single dominating non-local) dependency and all
+ ///    arguments have matching value numbers;
+ ///  - anything else: always receives a fresh number.
+ uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
+ if (AA->doesNotAccessMemory(C)) {
+ Expression exp = create_expression(C);
+ uint32_t &e = expressionNumbering[exp];
+ // A zero slot means this expression has not been numbered yet.
+ if (!e) e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ } else if (AA->onlyReadsMemory(C)) {
+ Expression exp = create_expression(C);
+ uint32_t &e = expressionNumbering[exp];
+ // First time this expression is seen: nothing to match against.
+ if (!e) {
+ e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ }
+ // Without memory dependence information equivalence cannot be proven, so
+ // hand out a fresh number (this also overwrites the expression's number).
+ if (!MD) {
+ e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ }
+
+ MemDepResult local_dep = MD->getDependency(C);
+
+ // Neither a definition nor a non-local dependency: give up and use a
+ // fresh number.
+ if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (local_dep.isDef()) {
+ CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+
+ if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ // Every argument must have the same value number in both calls. Note
+ // that the loop bound 'e' shadows the reference 'e' declared above.
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+ uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ // Identical to the call it depends on: share that call's number.
+ uint32_t v = lookup_or_add(local_cdep);
+ valueNumbering[C] = v;
+ return v;
+ }
+
+ // Non-local case.
+ const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(CallSite(C));
+ // FIXME: Move the checking logic to MemDep!
+ CallInst* cdep = nullptr;
+
+ // Check to see if we have a single dominating call instruction that is
+ // identical to C.
+ for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+ const NonLocalDepEntry *I = &deps[i];
+ if (I->getResult().isNonLocal())
+ continue;
+
+ // We don't handle non-definitions. If we already have a call, reject
+ // instruction dependencies.
+ if (!I->getResult().isDef() || cdep != nullptr) {
+ cdep = nullptr;
+ break;
+ }
+
+ CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+ // FIXME: All duplicated with non-local case.
+ // Accept the candidate only if it is a call whose block properly
+ // dominates the block containing C.
+ if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
+ cdep = NonLocalDepCall;
+ continue;
+ }
+
+ cdep = nullptr;
+ break;
+ }
+
+ // No usable dominating call was found.
+ if (!cdep) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ // Same argument-by-argument comparison as in the local case above.
+ if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+ uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookup_or_add(cdep);
+ valueNumbering[C] = v;
+ return v;
+
+ } else {
+ // The call is not known to be read-only: it always gets its own number.
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ /// Reports whether V has already been assigned a value number.
+ bool ValueTable::exists(Value *V) const {
+   return valueNumbering.find(V) != valueNumbering.end();
+ }
+
+ /// lookup_or_add - Return V's value number, numbering it first if needed.
+ /// Non-instructions and instructions with an unhandled opcode each get a
+ /// fresh number; calls are delegated to lookup_or_add_call; the remaining
+ /// handled opcodes are numbered through their Expression, so that equal
+ /// expressions share a number.
+ uint32_t ValueTable::lookup_or_add(Value *V) {
+   // Fast path: V has been numbered before.
+   DenseMap<Value*, uint32_t>::iterator Known = valueNumbering.find(V);
+   if (Known != valueNumbering.end())
+     return Known->second;
+
+   Instruction *Inst = dyn_cast<Instruction>(V);
+   if (!Inst) {
+     const uint32_t Fresh = nextValueNumber++;
+     valueNumbering[V] = Fresh;
+     return Fresh;
+   }
+
+   Expression Key;
+   switch (Inst->getOpcode()) {
+   case Instruction::Call:
+     return lookup_or_add_call(cast<CallInst>(Inst));
+
+   case Instruction::ExtractValue:
+     Key = create_extractvalue_expression(cast<ExtractValueInst>(Inst));
+     break;
+
+   // Binary operators.
+   case Instruction::Add:
+   case Instruction::FAdd:
+   case Instruction::Sub:
+   case Instruction::FSub:
+   case Instruction::Mul:
+   case Instruction::FMul:
+   case Instruction::UDiv:
+   case Instruction::SDiv:
+   case Instruction::FDiv:
+   case Instruction::URem:
+   case Instruction::SRem:
+   case Instruction::FRem:
+   case Instruction::Shl:
+   case Instruction::LShr:
+   case Instruction::AShr:
+   case Instruction::And:
+   case Instruction::Or:
+   case Instruction::Xor:
+   // Comparisons.
+   case Instruction::ICmp:
+   case Instruction::FCmp:
+   // Casts.
+   case Instruction::Trunc:
+   case Instruction::ZExt:
+   case Instruction::SExt:
+   case Instruction::FPToUI:
+   case Instruction::FPToSI:
+   case Instruction::UIToFP:
+   case Instruction::SIToFP:
+   case Instruction::FPTrunc:
+   case Instruction::FPExt:
+   case Instruction::PtrToInt:
+   case Instruction::IntToPtr:
+   case Instruction::BitCast:
+   // Everything else that is numbered by expression.
+   case Instruction::Select:
+   case Instruction::ExtractElement:
+   case Instruction::InsertElement:
+   case Instruction::ShuffleVector:
+   case Instruction::InsertValue:
+   case Instruction::GetElementPtr:
+     Key = create_expression(Inst);
+     break;
+
+   default: {
+     const uint32_t Fresh = nextValueNumber++;
+     valueNumbering[V] = Fresh;
+     return Fresh;
+   }
+   }
+
+   // A zero slot means the expression has not been numbered yet.
+   uint32_t &Slot = expressionNumbering[Key];
+   if (Slot == 0)
+     Slot = nextValueNumber++;
+   valueNumbering[V] = Slot;
+   return Slot;
+ }
+
+ /// Return the value number already assigned to V. V must have been numbered
+ /// before; this is checked with an assertion.
+ uint32_t ValueTable::lookup(Value *V) const {
+   DenseMap<Value*, uint32_t>::const_iterator Found = valueNumbering.find(V);
+   assert(Found != valueNumbering.end() && "Value not numbered?");
+   return Found->second;
+ }
+
+ /// Return the value number of the comparison (Opcode, Predicate, LHS, RHS),
+ /// allocating a new number if that comparison has not been seen. Handy when
+ /// the result of a comparison was deduced but no instruction computing it is
+ /// at hand; only the expression table is updated.
+ uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode,
+                                        CmpInst::Predicate Predicate,
+                                        Value *LHS, Value *RHS) {
+   Expression Key = create_cmp_expression(Opcode, Predicate, LHS, RHS);
+   uint32_t &Slot = expressionNumbering[Key];
+   if (Slot == 0)
+     Slot = nextValueNumber++;
+   return Slot;
+ }
+
+ /// Reset the table to its initial state: both maps emptied and numbering
+ /// restarted at 1.
+ void ValueTable::clear() {
+   nextValueNumber = 1;
+   expressionNumbering.clear();
+   valueNumbering.clear();
+ }
+
+ /// Forget the value number of V. Only the value map is touched; the
+ /// expression table keeps its entries.
+ void ValueTable::erase(Value *V) {
+   valueNumbering.erase(V);
+ }
+
+ /// verifyRemoved - Debug check: assert that V no longer appears as a key in
+ /// the value numbering map.
+ void ValueTable::verifyRemoved(const Value *V) const {
+   DenseMap<Value*, uint32_t>::const_iterator It = valueNumbering.begin();
+   const DenseMap<Value*, uint32_t>::const_iterator End = valueNumbering.end();
+   while (It != End) {
+     assert(It->first != V && "Inst still occurs in value numbering map!");
+     ++It;
+   }
+ }
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+ class GVN;
+ /// Describes a value that is available at the end of one basic block,
+ /// together with how it has to be interpreted to satisfy a load.
+ struct AvailableValueInBlock {
+   /// The basic block the value is available in.
+   BasicBlock *BB;
+
+   /// How the pointer stored in Val must be interpreted.
+   enum ValType {
+     SimpleVal,  // A simple offsetted value that is accessed.
+     LoadVal,    // A value produced by a load.
+     MemIntrin,  // A memory intrinsic which is loaded from.
+     UndefVal    // A UndefValue representing a value from dead block (which
+                 // is not yet physically removed from the CFG).
+   };
+
+   /// The value that is live out of the block, tagged with its ValType.
+   PointerIntPair<Value *, 2, ValType> Val;
+
+   /// The byte offset in Val that is interesting for the load query.
+   unsigned Offset;
+
+   /// A plain value, optionally read at a byte offset.
+   static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+                                    unsigned Offset = 0) {
+     return make(BB, V, SimpleVal, Offset);
+   }
+
+   /// A memory intrinsic that the load reads from.
+   static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI,
+                                      unsigned Offset = 0) {
+     return make(BB, MI, MemIntrin, Offset);
+   }
+
+   /// The result of another load.
+   static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI,
+                                        unsigned Offset = 0) {
+     return make(BB, LI, LoadVal, Offset);
+   }
+
+   /// An undef placeholder; carries a null pointer and offset 0.
+   static AvailableValueInBlock getUndef(BasicBlock *BB) {
+     return make(BB, nullptr, UndefVal, 0);
+   }
+
+   bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+   bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+   bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+   bool isUndefValue() const { return Val.getInt() == UndefVal; }
+
+   /// Typed accessors; each asserts that the stored kind matches.
+   Value *getSimpleValue() const {
+     assert(isSimpleValue() && "Wrong accessor");
+     return Val.getPointer();
+   }
+
+   LoadInst *getCoercedLoadValue() const {
+     assert(isCoercedLoadValue() && "Wrong accessor");
+     return cast<LoadInst>(Val.getPointer());
+   }
+
+   MemIntrinsic *getMemIntrinValue() const {
+     assert(isMemIntrinValue() && "Wrong accessor");
+     return cast<MemIntrinsic>(Val.getPointer());
+   }
+
+   /// Emit code into this block to adjust the value defined here to the
+   /// specified type. This handles various coercion cases.
+   Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const;
+
+ private:
+   /// Shared implementation of the factory functions above.
+   static AvailableValueInBlock make(BasicBlock *BB, Value *V, ValType Kind,
+                                     unsigned Offset) {
+     AvailableValueInBlock Res;
+     Res.BB = BB;
+     Res.Val.setPointer(V);
+     Res.Val.setInt(Kind);
+     Res.Offset = Offset;
+     return Res;
+   }
+ };
+
+ /// The global value numbering function pass. Holds the value table, the
+ /// per-value-number leader lists, and the analyses used while processing a
+ /// function.
+ class GVN : public FunctionPass {
+ // True when the pass was created with load handling disabled; in that case
+ // MemoryDependenceAnalysis is not requested (see getAnalysisUsage).
+ bool NoLoads;
+ MemoryDependenceAnalysis *MD;
+ DominatorTree *DT;
+ const TargetLibraryInfo *TLI;
+ AssumptionCache *AC;
+ SetVector<BasicBlock *> DeadBlocks;
+
+ ValueTable VN;
+
+ /// A mapping from value numbers to lists of Value*'s that
+ /// have that value number. Use findLeader to query it.
+ /// Each map slot is the head of a singly linked list; additional entries
+ /// are chained through Next.
+ struct LeaderTableEntry {
+ Value *Val;
+ const BasicBlock *BB;
+ LeaderTableEntry *Next;
+ };
+ DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
+ // Backing storage for the non-head LeaderTableEntry nodes.
+ BumpPtrAllocator TableAllocator;
+
+ // Block-local map of equivalent values to their leader, does not
+ // propagate to any successors. Entries added mid-block are applied
+ // to the remaining instructions in the block.
+ SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap;
+ // Instructions queued by markInstructionForDeletion.
+ SmallVector<Instruction*, 8> InstrsToErase;
+
+ typedef SmallVector<NonLocalDepResult, 64> LoadDepVect;
+ typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect;
+ typedef SmallVector<BasicBlock*, 64> UnavailBlkVect;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ // Note: only MD is initialised here; DT, TLI and AC are left unset by the
+ // constructor.
+ explicit GVN(bool noloads = false)
+ : FunctionPass(ID), NoLoads(noloads), MD(nullptr) {
+ initializeGVNPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ /// This removes the specified instruction from
+ /// our various maps and marks it for deletion.
+ void markInstructionForDeletion(Instruction *I) {
+ VN.erase(I);
+ InstrsToErase.push_back(I);
+ }
+
+ DominatorTree &getDominatorTree() const { return *DT; }
+ AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
+ MemoryDependenceAnalysis &getMemDep() const { return *MD; }
+ private:
+ /// Push a new Value to the LeaderTable onto the list for its value number.
+ /// The first value for a number is stored directly in the map slot; later
+ /// ones are allocated from TableAllocator and linked in right after the
+ /// head.
+ void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
+ LeaderTableEntry &Curr = LeaderTable[N];
+ // Empty head slot: fill it in place.
+ if (!Curr.Val) {
+ Curr.Val = V;
+ Curr.BB = BB;
+ return;
+ }
+
+ LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
+ Node->Val = V;
+ Node->BB = BB;
+ Node->Next = Curr.Next;
+ Curr.Next = Node;
+ }
+
+ /// Scan the list of values corresponding to a given
+ /// value number, and remove the given instruction if encountered.
+ /// An entry matches only if both its value and its block match.
+ void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
+ LeaderTableEntry* Prev = nullptr;
+ LeaderTableEntry* Curr = &LeaderTable[N];
+
+ while (Curr && (Curr->Val != I || Curr->BB != BB)) {
+ Prev = Curr;
+ Curr = Curr->Next;
+ }
+
+ // Not in the list: nothing to do.
+ if (!Curr)
+ return;
+
+ if (Prev) {
+ // Interior or tail node: unlink it.
+ Prev->Next = Curr->Next;
+ } else {
+ // The match is the head, which lives inside the map and cannot be
+ // unlinked: either blank it out or copy the second node over it.
+ if (!Curr->Next) {
+ Curr->Val = nullptr;
+ Curr->BB = nullptr;
+ } else {
+ LeaderTableEntry* Next = Curr->Next;
+ Curr->Val = Next->Val;
+ Curr->BB = Next->BB;
+ Curr->Next = Next->Next;
+ }
+ }
+ }
+
+ // List of critical edges to be split between iterations.
+ SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+
+ // Declares the analyses this pass needs: the assumption cache, the
+ // dominator tree, target library info, alias analysis and -- unless loads
+ // are disabled -- memory dependence analysis. The dominator tree and
+ // GlobalsAA are marked as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (!NoLoads)
+ AU.addRequired<MemoryDependenceAnalysis>();
+ AU.addRequired<AAResultsWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+
+ // Helper functions of redundant load elimination
+ bool processLoad(LoadInst *L);
+ bool processNonLocalLoad(LoadInst *L);
+ bool processAssumeIntrinsic(IntrinsicInst *II);
+ void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks);
+ bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks);
+
+ // Other helper routines
+ bool processInstruction(Instruction *I);
+ bool processBlock(BasicBlock *BB);
+ // Only defined when NDEBUG is not set or LLVM_ENABLE_DUMP is defined.
+ void dump(DenseMap<uint32_t, Value*> &d);
+ bool iterateOnFunction(Function &F);
+ bool performPRE(Function &F);
+ bool performScalarPRE(Instruction *I);
+ bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ unsigned int ValNo);
+ Value *findLeader(const BasicBlock *BB, uint32_t num);
+ void cleanupGlobalSets();
+ void verifyRemoved(const Instruction *I) const;
+ bool splitCriticalEdges();
+ BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ);
+ bool replaceOperandsWithConsts(Instruction *I) const;
+ bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+ bool DominatesByEdge);
+ bool processFoldableCondBr(BranchInst *BI);
+ void addDeadBlock(BasicBlock *BB);
+ void assignValNumForDeadCode();
+ };
+
+ // Storage for the pass identifier; its address identifies the pass.
+ char GVN::ID = 0;
+}
+
+ // Public factory for the pass. NoLoads is forwarded to the GVN constructor,
+ // where it disables the use of memory dependence analysis.
+ FunctionPass *llvm::createGVNPass(bool NoLoads) {
+   FunctionPass *Pass = new GVN(NoLoads);
+   return Pass;
+ }
+
+ // Pass registration: registers GVN under the argument name "gvn" with the
+ // description "Global Value Numbering", and lists the passes that must be
+ // initialized along with it.
+ INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
+ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+ INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
+
+ #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ // Debug helper: print every (number, value) pair of the map to errs(),
+ // enclosed in braces. Each number is printed on its own line, followed by
+ // the dump of the associated value.
+ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+   errs() << "{\n";
+   for (auto &Entry : d) {
+     errs() << Entry.first << "\n";
+     Entry.second->dump();
+   }
+   errs() << "}\n";
+ }
+ #endif
+
+ /// Return true if we can prove that the value
+ /// we're analyzing is fully available in the specified block. As we go, keep
+ /// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+ /// map is actually a tri-state map with the following values:
+ ///   0) we know the block *is not* fully available.
+ ///   1) we know the block *is* fully available.
+ ///   2) we do not know whether the block is fully available or not, but we are
+ ///      currently speculating that it will be.
+ ///   3) we are speculating for this block and have used that to speculate for
+ ///      other blocks.
+ ///
+ /// RecurseDepth is the current recursion depth; once it exceeds
+ /// MaxRecurseDepth the function conservatively answers false.
+ static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+                             DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+                             uint32_t RecurseDepth) {
+   if (RecurseDepth > MaxRecurseDepth)
+     return false;
+
+   // Optimistically assume that the block is fully available and check to see
+   // if we already know about this block in one lookup. The second member of
+   // the result tells whether the insertion took place, so it is a bool.
+   std::pair<DenseMap<BasicBlock*, char>::iterator, bool> IV =
+     FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+   // If the entry already existed for this block, return the precomputed value.
+   if (!IV.second) {
+     // If this is a speculative "available" value, mark it as being used for
+     // speculation of other blocks.
+     if (IV.first->second == 2)
+       IV.first->second = 3;
+     return IV.first->second != 0;
+   }
+
+   // Otherwise, see if it is fully available in all predecessors.
+   pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+   // If this block has no predecessors, it isn't live-in here.
+   if (PI == PE)
+     goto SpeculationFailure;
+
+   for (; PI != PE; ++PI)
+     // If the value isn't fully available in one of our predecessors, then it
+     // isn't fully available in this block either. Undo our previous
+     // optimistic assumption and bail out.
+     if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
+       goto SpeculationFailure;
+
+   return true;
+
+ // If we get here, we found out that this is not, after
+ // all, a fully-available block. We have a problem if we speculated on this and
+ // used the speculation to mark other blocks as available.
+ SpeculationFailure:
+   // Look the entry up again: the recursive calls above may have grown the
+   // map, so the iterator obtained from insert() is not reused here.
+   char &BBVal = FullyAvailableBlocks[BB];
+
+   // If we didn't speculate on this, just return with it set to false.
+   if (BBVal == 2) {
+     BBVal = 0;
+     return false;
+   }
+
+   // If we did speculate on this value, we could have blocks set to 1 that are
+   // incorrect. Walk the (transitive) successors of this block and mark them as
+   // 0 if set to one.
+   SmallVector<BasicBlock*, 32> BBWorklist;
+   BBWorklist.push_back(BB);
+
+   do {
+     BasicBlock *Entry = BBWorklist.pop_back_val();
+     // Note that this sets blocks to 0 (unavailable) if they happen to not
+     // already be in FullyAvailableBlocks. This is safe.
+     char &EntryVal = FullyAvailableBlocks[Entry];
+     if (EntryVal == 0) continue; // Already unavailable.
+
+     // Mark as unavailable.
+     EntryVal = 0;
+
+     BBWorklist.append(succ_begin(Entry), succ_end(Entry));
+   } while (!BBWorklist.empty());
+
+   return false;
+ }
+
+
+ /// Return true if CoerceAvailableValueToLoadType will succeed, i.e. neither
+ /// type is a first-class struct or array and the stored value is at least as
+ /// wide (in bits) as the type being loaded.
+ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
+                                             Type *LoadTy,
+                                             const DataLayout &DL) {
+   Type *StoredTy = StoredVal->getType();
+
+   // Aggregates cannot be bitcast to an integer, so they are rejected outright.
+   const bool LoadIsAggregate = LoadTy->isStructTy() || LoadTy->isArrayTy();
+   if (LoadIsAggregate || StoredTy->isStructTy() || StoredTy->isArrayTy())
+     return false;
+
+   // The store has to supply every bit that the load reads.
+   return DL.getTypeSizeInBits(StoredTy) >= DL.getTypeSizeInBits(LoadTy);
+ }
+
+ /// If we saw a store of a value to memory, and
+ /// then a load from a must-aliased pointer of a different type, try to coerce
+ /// the stored value. LoadedTy is the type of the load we want to replace.
+ /// IRB is IRBuilder used to insert new instructions.
+ ///
+ /// Equal-sized values are converted with bitcast / ptrtoint / inttoptr as
+ /// needed. A larger stored value is first turned into an integer, shifted
+ /// down on big-endian targets, truncated to the load's width and finally
+ /// converted to LoadedTy.
+ ///
+ /// If we can't do it, return null.
+ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+ IRBuilder<> &IRB,
+ const DataLayout &DL) {
+ if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL))
+ return nullptr;
+
+ // Track the current type of StoredVal; it is updated below every time the
+ // value is converted.
+ Type *StoredValTy = StoredVal->getType();
+
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy);
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy);
+
+ // If the store and reload are the same size, we can always reuse it.
+ if (StoreSize == LoadSize) {
+ // Pointer to Pointer -> use bitcast.
+ if (StoredValTy->getScalarType()->isPointerTy() &&
+ LoadedTy->getScalarType()->isPointerTy())
+ return IRB.CreateBitCast(StoredVal, LoadedTy);
+
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // When the load wants a pointer, go through the matching integer type
+ // first; the inttoptr below produces the final pointer.
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->getScalarType()->isPointerTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
+
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->getScalarType()->isPointerTy())
+ StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
+
+ return StoredVal;
+ }
+
+ // If the loaded value is smaller than the available value, then we can
+ // extract out a piece from it. If the available value is too small, then we
+ // can't do anything.
+ assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");
+
+ // Convert source pointers to integers, which can be manipulated.
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // Convert vectors and fp to integer, which can be manipulated.
+ if (!StoredValTy->isIntegerTy()) {
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize);
+ StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
+ }
+
+ // If this is a big-endian system, we need to shift the value down to the low
+ // bits so that a truncate will work.
+ if (DL.isBigEndian()) {
+ StoredVal = IRB.CreateLShr(StoredVal, StoreSize - LoadSize, "tmp");
+ }
+
+ // Truncate the integer to the right size now.
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
+ StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
+
+ // The load wanted exactly this integer type: done.
+ if (LoadedTy == NewIntTy)
+ return StoredVal;
+
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->getScalarType()->isPointerTy())
+ return IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
+
+ // Otherwise, bitcast.
+ return IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
+ }
+
+ /// This function is called when we have a
+ /// memdep query of a load that ends up being a clobbering memory write (store,
+ /// memset, memcpy, memmove). This means that the write *may* provide bits used
+ /// by the load but we can't be sure because the pointers don't mustalias.
+ ///
+ /// Check this case to see if there is anything more we can do before we give
+ /// up. This returns -1 if we have to give up, or a byte number in the stored
+ /// value of the piece that feeds the load.
+ ///
+ /// \param LoadTy          type being loaded.
+ /// \param LoadPtr         address the load reads from.
+ /// \param WritePtr        address the clobbering write starts at.
+ /// \param WriteSizeInBits size of the write, in bits.
+ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+                                           Value *WritePtr,
+                                           uint64_t WriteSizeInBits,
+                                           const DataLayout &DL) {
+   // If the loaded or stored value is a first class array or struct, don't try
+   // to transform them. We need to be able to bitcast to integer.
+   if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+     return -1;
+
+   // Both accesses must be expressible as "same base + constant offset".
+   int64_t StoreOffset = 0, LoadOffset = 0;
+   Value *StoreBase =
+       GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+   Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+   if (StoreBase != LoadBase)
+     return -1;
+
+   // If the load and store are to the exact same address, they should have been
+   // a must alias. AA must have gotten confused.
+   // FIXME: Study to see if/when this happens. One case is forwarding a memset
+   // to a load from the base of the memset.
+ #if 0
+   if (LoadOffset == StoreOffset) {
+     dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
+     << "Base = " << *StoreBase << "\n"
+     << "Store Ptr = " << *WritePtr << "\n"
+     << "Store Offs = " << StoreOffset << "\n"
+     << "Load Ptr = " << *LoadPtr << "\n";
+     abort();
+   }
+ #endif
+
+   // If the load and store don't overlap at all, the store doesn't provide
+   // anything to the load. In this case, they really don't alias at all, AA
+   // must have gotten confused.
+   uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+
+   // Only whole-byte accesses are handled.
+   if ((WriteSizeInBits & 7) | (LoadSize & 7))
+     return -1;
+   uint64_t StoreSize = WriteSizeInBits >> 3; // Convert to bytes.
+   LoadSize >>= 3;
+
+   // One-past-the-end byte offsets of both accesses. These are computed in
+   // signed arithmetic on purpose: the offsets may be negative, and mixing
+   // them with the unsigned sizes would wrap around and break the range
+   // comparisons below.
+   const int64_t StoreEnd = StoreOffset + int64_t(StoreSize);
+   const int64_t LoadEnd = LoadOffset + int64_t(LoadSize);
+
+   bool isAAFailure = false;
+   if (StoreOffset < LoadOffset)
+     isAAFailure = StoreEnd <= LoadOffset;
+   else
+     isAAFailure = LoadEnd <= StoreOffset;
+
+   if (isAAFailure) {
+ #if 0
+     dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n"
+     << "Base = " << *StoreBase << "\n"
+     << "Store Ptr = " << *WritePtr << "\n"
+     << "Store Offs = " << StoreOffset << "\n"
+     << "Load Ptr = " << *LoadPtr << "\n";
+     abort();
+ #endif
+     return -1;
+   }
+
+   // If the Load isn't completely contained within the stored bits, we don't
+   // have all the bits to feed it. We could do something crazy in the future
+   // (issue a smaller load then merge the bits in) but this seems unlikely to be
+   // valuable.
+   if (StoreOffset > LoadOffset || StoreEnd < LoadEnd)
+     return -1;
+
+   // Okay, we can do this transformation. Return the number of bytes into the
+   // store that the load is.
+   return LoadOffset-StoreOffset;
+ }
+
+/// Store flavour of the clobber analysis: the memdep query for a load came
+/// back with the clobbering store DepSI.  Returns the byte offset of the load
+/// within the stored value, or -1 if the store cannot feed the load.
+static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+                                          StoreInst *DepSI) {
+  Type *StoredTy = DepSI->getValueOperand()->getType();
+
+  // Stores of first-class aggregates cannot be read from yet.
+  if (StoredTy->isStructTy() || StoredTy->isArrayTy())
+    return -1;
+
+  // Defer to the generic write analysis using the store's address and the
+  // bit width of the stored value.
+  const DataLayout &DL = DepSI->getModule()->getDataLayout();
+  uint64_t StoredBits = DL.getTypeSizeInBits(StoredTy);
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                        DepSI->getPointerOperand(), StoredBits,
+                                        DL);
+}
+
+/// Load flavour of the clobber analysis: the memdep query for a load came back
+/// clobbered by the earlier load DepLI.  See whether DepLI, either as it is or
+/// after widening, can feed the later load.  Returns the byte offset of the
+/// later load within DepLI's memory, or -1 if that is not possible.
+static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
+                                         LoadInst *DepLI, const DataLayout &DL){
+  Type *DepTy = DepLI->getType();
+
+  // Loads of first-class aggregates cannot be read from yet.
+  if (DepTy->isStructTy() || DepTy->isArrayTy())
+    return -1;
+
+  // First try DepLI at its current width.
+  Value *DepPtr = DepLI->getPointerOperand();
+  uint64_t DepBits = DL.getTypeSizeInBits(DepTy);
+  int Direct =
+      AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepBits, DL);
+  if (Direct != -1)
+    return Direct;
+
+  // Otherwise ask memdep whether DepLI can be widened to cover this load; a
+  // result of zero means it cannot.
+  int64_t LoadOffs = 0;
+  const Value *LoadBase =
+      GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+  unsigned WideSize = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
+      LoadBase, LoadOffs, LoadSize, DepLI);
+  if (WideSize == 0)
+    return -1;
+
+  // Redo the analysis as if DepLI already had the widened size (in bits).
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, WideSize*8,
+                                        DL);
+}
+
+
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being clobbered by a memory intrinsic (memset, memcpy, memmove).  Returns
+/// the byte offset of the load within the region the intrinsic writes, or -1
+/// if the intrinsic cannot be used to supply the loaded value.
+static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+                                            MemIntrinsic *MI,
+                                            const DataLayout &DL) {
+  // Only transfers with a constant length can be handled.
+  ConstantInt *LenCst = dyn_cast<ConstantInt>(MI->getLength());
+  if (!LenCst)
+    return -1;
+  uint64_t MemSizeInBits = LenCst->getZExtValue()*8;
+
+  // For a memset it is enough that the load falls inside the written region.
+  if (MI->getIntrinsicID() == Intrinsic::memset)
+    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                          MemSizeInBits, DL);
+
+  // A memcpy/memmove is only usable when it copies out of constant memory,
+  // because then the loaded value can be read directly from the constant.
+  MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+  Constant *Src = dyn_cast<Constant>(MTI->getSource());
+  if (!Src)
+    return -1;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+  if (!GV || !GV->isConstant())
+    return -1;
+
+  // The load has to lie within the bounds of the transfer.
+  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                              MI->getDest(), MemSizeInBits, DL);
+  if (Offset == -1)
+    return -1;
+
+  // Build "(LoadTy*)((i8*)Src + Offset)" as a constant expression and accept
+  // the transfer only if a load from that address constant folds.
+  LLVMContext &Ctx = Src->getContext();
+  unsigned AS = Src->getType()->getPointerAddressSpace();
+  Constant *BytePtr =
+      ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Ctx, AS));
+  Constant *OffsetCst =
+      ConstantInt::get(Type::getInt64Ty(Ctx), (unsigned)Offset);
+  Constant *Addr = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), BytePtr,
+                                                  OffsetCst);
+  Addr = ConstantExpr::getBitCast(Addr, PointerType::get(LoadTy, AS));
+  return ConstantFoldLoadFromConstPtr(Addr, DL) ? Offset : -1;
+}
+
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store.  This means that the store provides bits used by
+/// the load but the pointers don't mustalias.  Emit, before InsertPt, the code
+/// that extracts the LoadTy-typed value located Offset bytes into SrcVal and
+/// return it.
+static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
+                                   Type *LoadTy,
+                                   Instruction *InsertPt, const DataLayout &DL){
+  LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+  // Sizes of the stored and loaded values in bytes, rounded up.
+  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+
+  IRBuilder<> Builder(InsertPt);
+
+  // Work on an integer view of the stored value: pointers go through
+  // ptrtoint, anything else that is not already an integer is bitcast.
+  Value *Bits = SrcVal;
+  if (Bits->getType()->getScalarType()->isPointerTy())
+    Bits = Builder.CreatePtrToInt(Bits, DL.getIntPtrType(Bits->getType()));
+  if (!Bits->getType()->isIntegerTy())
+    Bits = Builder.CreateBitCast(Bits, IntegerType::get(Ctx, StoreSize*8));
+
+  // Move the wanted bytes down to the least significant end; which bytes
+  // those are depends on the endianness.
+  unsigned ShiftAmt = DL.isLittleEndian() ? Offset*8
+                                          : (StoreSize-LoadSize-Offset)*8;
+  if (ShiftAmt != 0)
+    Bits = Builder.CreateLShr(Bits, ShiftAmt);
+
+  // Drop the high bytes that the load does not read.
+  if (LoadSize != StoreSize)
+    Bits = Builder.CreateTrunc(Bits, IntegerType::get(Ctx, LoadSize*8));
+
+  return CoerceAvailableValueToLoadType(Bits, LoadTy, Builder, DL);
+}
+
+/// Materialize the value of a load of type LoadTy located Offset bytes into
+/// the memory read by the earlier load SrcVal (a load/load clobber whose
+/// pointers don't mustalias). If the requested bytes extend past SrcVal,
+/// SrcVal is first superseded by a wider integer load inserted right after it.
+/// The bits are then extracted with GetStoreValueForLoad before InsertPt.
+static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
+ Type *LoadTy, Instruction *InsertPt,
+ GVN &gvn) {
+ const DataLayout &DL = SrcVal->getModule()->getDataLayout();
+ // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+ // widen SrcVal out to a larger load.
+ unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset+LoadSize > SrcValSize) {
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+ // We have a load/load clobber where SrcVal can be widened to cover this
+ // load, so widen it to the next power of 2 size that is big enough.
+ unsigned NewLoadSize = Offset+LoadSize;
+ if (!isPowerOf2_32(NewLoadSize))
+ NewLoadSize = NextPowerOf2(NewLoadSize);
+
+ Value *PtrVal = SrcVal->getPointerOperand();
+ // The wide load reads from the same address as the old one, but through a pointer to the wider integer type.
+ // Insert the new load after the old load. This ensures that subsequent
+ // memdep queries will find the new load. We can't easily remove the old
+ // load completely because it is already in the value numbering table.
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+ Type *DestPTy =
+ IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
+ DestPTy = PointerType::get(DestPTy,
+ PtrVal->getType()->getPointerAddressSpace());
+ Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+ PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+ LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+ NewLoad->takeName(SrcVal);
+ NewLoad->setAlignment(SrcVal->getAlignment()); // Keep the alignment of the original load.
+
+ DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+ // Replace uses of the original load with the wider load. On a big endian
+ // system, we need to shift down to get the relevant bits.
+ Value *RV = NewLoad;
+ if (DL.isBigEndian())
+ RV = Builder.CreateLShr(RV,
+ NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
+ RV = Builder.CreateTrunc(RV, SrcVal->getType());
+ SrcVal->replaceAllUsesWith(RV);
+
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(SrcVal);
+ SrcVal = NewLoad; // From here on, extract the bits from the widened load.
+ }
+ // SrcVal now covers Offset+LoadSize bytes, so it can be treated like a stored value.
+ return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering mem intrinsic.  Emit, before InsertPt, the value that a
+/// load of type LoadTy at byte Offset into the region written by SrcInst
+/// observes.  For memset this is the set byte splatted to the load width; for
+/// memcpy/memmove it is the constant-folded load from the constant source.
+static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                                     Type *LoadTy, Instruction *InsertPt,
+                                     const DataLayout &DL){
+  LLVMContext &Ctx = LoadTy->getContext();
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
+
+  IRBuilder<> Builder(InsertPt);
+
+  // We know that this method is only called when the mem transfer fully
+  // provides the bits for the load.
+  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+    // independently of what the offset is.
+    Value *Splat = MSI->getValue();
+    if (LoadSize != 1)
+      Splat = Builder.CreateZExt(Splat, IntegerType::get(Ctx, LoadSize*8));
+
+    // The single (zero-extended) byte, kept around for the byte-wise steps.
+    Value *OneElt = Splat;
+
+    // Splat the value out to the right number of bits.
+    unsigned NumBytesSet = 1;
+    while (NumBytesSet != LoadSize) {
+      if (NumBytesSet*2 <= LoadSize) {
+        // Double the number of bytes set by OR-ing in a shifted copy.
+        Value *Shifted = Builder.CreateShl(Splat, NumBytesSet*8);
+        Splat = Builder.CreateOr(Splat, Shifted);
+        NumBytesSet <<= 1;
+      } else {
+        // Doubling would overshoot: append one byte at a time instead.
+        Value *Shifted = Builder.CreateShl(Splat, 1*8);
+        Splat = Builder.CreateOr(OneElt, Shifted);
+        ++NumBytesSet;
+      }
+    }
+
+    return CoerceAvailableValueToLoadType(Splat, LoadTy, Builder, DL);
+  }
+
+  // Otherwise, this is a memcpy/memmove from a constant global.
+  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+  Constant *Src = cast<Constant>(MTI->getSource());
+  LLVMContext &SrcCtx = Src->getContext();
+  unsigned AS = Src->getType()->getPointerAddressSpace();
+
+  // Form "(LoadTy*)((i8*)Src + Offset)" and constant fold a load from it.
+  Constant *BytePtr =
+      ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(SrcCtx, AS));
+  Constant *OffsetCst =
+      ConstantInt::get(Type::getInt64Ty(SrcCtx), (unsigned)Offset);
+  Constant *Addr = ConstantExpr::getGetElementPtr(Type::getInt8Ty(SrcCtx),
+                                                  BytePtr, OffsetCst);
+  Addr = ConstantExpr::getBitCast(Addr, PointerType::get(LoadTy, AS));
+  return ConstantFoldLoadFromConstPtr(Addr, DL);
+}
+
+
+/// Given the set of available values described by ValuesPerBlock, construct
+/// SSA form so that LI can be eliminated.  Returns the value that should be
+/// used in place of LI at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+                         SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+                                     GVN &gvn) {
+  BasicBlock *LoadBB = LI->getParent();
+
+  // Fully redundant, dominating case: a single available value whose block
+  // properly dominates the load can be used directly, no PHIs needed.
+  if (ValuesPerBlock.size() == 1) {
+    const AvailableValueInBlock &Sole = ValuesPerBlock[0];
+    if (gvn.getDominatorTree().properlyDominates(Sole.BB, LoadBB)) {
+      assert(!Sole.isUndefValue() && "Dead BB dominate this block");
+      return Sole.MaterializeAdjustedValue(LI, gvn);
+    }
+  }
+
+  // Otherwise, we have to construct SSA form.
+  SmallVector<PHINode*, 8> NewPHIs;
+  SSAUpdater SSAUpdate(&NewPHIs);
+  SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+  // Register one value per block; later entries for an already registered
+  // block are ignored.
+  for (const AvailableValueInBlock &AV : ValuesPerBlock)
+    if (!SSAUpdate.HasValueForBlock(AV.BB))
+      SSAUpdate.AddAvailableValue(AV.BB, AV.MaterializeAdjustedValue(LI, gvn));
+
+  // Perform PHI construction.
+  return SSAUpdate.GetValueInMiddleOfBlock(LoadBB);
+}
+
+/// Produce the value this entry makes available for the load LI, emitting any
+/// extraction/coercion code in front of the terminator of BB.  Depending on
+/// the kind of entry this is a plain value, a (possibly widened) load, a
+/// memory intrinsic, or undef.
+Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI,
+                                                       GVN &gvn) const {
+  Type *LoadTy = LI->getType();
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+
+  // Plain value: usable as is when the type already matches, otherwise the
+  // needed bits are extracted from it.
+  if (isSimpleValue()) {
+    Value *Simple = getSimpleValue();
+    if (Simple->getType() == LoadTy)
+      return Simple;
+
+    Value *Res =
+        GetStoreValueForLoad(Simple, Offset, LoadTy, BB->getTerminator(), DL);
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
+                 << *getSimpleValue() << '\n'
+                 << *Res << '\n' << "\n\n\n");
+    return Res;
+  }
+
+  // Earlier load: usable as is only for an exact type match at offset zero.
+  if (isCoercedLoadValue()) {
+    LoadInst *Load = getCoercedLoadValue();
+    if (Load->getType() == LoadTy && Offset == 0)
+      return Load;
+
+    Value *Res =
+        GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), gvn);
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
+                 << *getCoercedLoadValue() << '\n'
+                 << *Res << '\n' << "\n\n\n");
+    return Res;
+  }
+
+  // Memory intrinsic: always needs code (or a folded constant) to be built.
+  if (isMemIntrinValue()) {
+    Value *Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+                                        BB->getTerminator(), DL);
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+                 << " " << *getMemIntrinValue() << '\n'
+                 << *Res << '\n' << "\n\n\n");
+    return Res;
+  }
+
+  // Nothing else is left but the undef entry.
+  assert(isUndefValue() && "Should be UndefVal");
+  DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+  return UndefValue::get(LoadTy);
+}
+
+/// Return true if Inst is a call to the llvm.lifetime.start intrinsic.
+static bool isLifetimeStart(const Instruction *Inst) {
+  const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+  return II && II->getIntrinsicID() == Intrinsic::lifetime_start;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Classify every non-local dependency of LI. Each block whose dependency
+ // can supply the loaded value (possibly after extracting or coercing bits)
+ // gets an entry in ValuesPerBlock; every other block, e.g. one whose
+ // dependency produces an unknown value for the load (such as a call that
+ // could potentially clobber it), is recorded in UnavailableBlocks.
+ unsigned NumDeps = Deps.size();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+ BasicBlock *DepBB = Deps[i].getBB();
+ MemDepResult DepInfo = Deps[i].getResult();
+
+ if (DeadBlocks.count(DepBB)) {
+ // A dependency in a dead block is treated as if it were a load that
+ // evaluates to the same value as LI; an undef entry is recorded for it.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+ // Results that are neither a def nor a clobber tell us nothing about the value.
+ if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ if (DepInfo.isClobber()) {
+ // The address being loaded in this non-local block may not be the same as
+ // the pointer operand of the load if PHI translation occurs. Make sure
+ // to consider the right address.
+ Value *Address = Deps[i].getAddress();
+
+ // If the dependence is to a store that writes to a superset of the bits
+ // read by the load, we can extract the bits we need for the load from the
+ // stored value.
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+ if (Address) {
+ int Offset =
+ AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+ if (Offset != -1) {
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ DepSI->getValueOperand(),
+ Offset));
+ continue;
+ }
+ }
+ }
+
+ // Check to see if we have something like this:
+ // load i32* P
+ // load i8* (P+1)
+ // if we have this, replace the latter with an extraction from the former.
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+ // If this is a clobber and L is the first instruction in its block, then
+ // we have the first instruction in the entry block.
+ if (DepLI != LI && Address) {
+ int Offset =
+ AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+ if (Offset != -1) {
+ ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
+ Offset));
+ continue;
+ }
+ }
+ }
+
+ // If the clobbering value is a memset/memcpy/memmove, see if we can
+ // forward a value on from it.
+ if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+ if (Address) {
+ int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ DepMI, DL);
+ if (Offset != -1) {
+ ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
+ Offset));
+ continue;
+ }
+ }
+ }
+ // None of the clobber cases above could supply the value.
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ // DepInfo.isDef() here
+
+ Instruction *DepInst = DepInfo.getInst();
+
+ // Loading the allocation -> undef.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ // Loading immediately after lifetime begin -> undef.
+ isLifetimeStart(DepInst)) {
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ UndefValue::get(LI->getType())));
+ continue;
+ }
+
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(
+ DepBB, Constant::getNullValue(LI->getType())));
+ continue;
+ }
+
+ if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
+ // different types if we have to.
+ if (S->getValueOperand()->getType() != LI->getType()) {
+ // The stored value can only be reused if it can be coerced to the
+ // loaded type (CanCoerceMustAliasedValueToLoad decides that).
+ if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ LI->getType(), DL)) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+ }
+
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ S->getValueOperand()));
+ continue;
+ }
+
+ if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+ // If the types mismatch and we can't handle it, reject reuse of the load.
+ if (LD->getType() != LI->getType()) {
+ // The value loaded by LD can only be reused if it can be coerced to
+ // the type loaded by LI.
+ if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+ }
+ ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD));
+ continue;
+ }
+ // Any other defining instruction is not understood here.
+ UnavailableBlocks.push_back(DepBB);
+ }
+}
+
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Okay, we have *some* definitions of the value. This means that the value
+ // is available in some of our (transitive) predecessors. Let's think about
+ // doing PRE of this load. This will involve inserting a new load into the
+ // predecessor when it's not available. We could do this in general, but
+ // prefer to not increase code size. As such, we only do this when we know
+ // that we only have to insert *one* load (which means we're basically moving
+ // the load, not inserting a new one).
+ // Returns true if LI was eliminated, or if PRE failed only after critical edges had already been split.
+ SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
+ UnavailableBlocks.end());
+
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
+ BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *TmpBB = LoadBB;
+
+ while (TmpBB->getSinglePredecessor()) {
+ TmpBB = TmpBB->getSinglePredecessor();
+ if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+ return false;
+ if (Blockers.count(TmpBB))
+ return false;
+
+ // If any of these blocks has more than one successor (i.e. if the edge we
+ // just traversed was critical), then there are other paths through this
+ // block along which the load may not be anticipated. Hoisting the load
+ // above this block would be adding the load to execution paths along
+ // which it was not previously executed.
+ if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+ return false;
+ }
+
+ assert(TmpBB);
+ LoadBB = TmpBB;
+
+ // Check to see how many predecessors have the loaded value fully
+ // available.
+ MapVector<BasicBlock *, Value *> PredLoads;
+ DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+ for (const AvailableValueInBlock &AV : ValuesPerBlock)
+ FullyAvailableBlocks[AV.BB] = true;
+ for (BasicBlock *UnavailableBB : UnavailableBlocks)
+ FullyAvailableBlocks[UnavailableBB] = false;
+
+ SmallVector<BasicBlock *, 4> CriticalEdgePred;
+ for (BasicBlock *Pred : predecessors(LoadBB)) {
+ // If any predecessor block is an EH pad that does not allow non-PHI
+ // instructions before the terminator, we can't PRE the load.
+ if (Pred->getTerminator()->isEHPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+ // Predecessors that already have the value need no new load.
+ if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+ continue;
+ }
+ // A predecessor with several successors reaches LoadBB over a critical edge, which must be split first.
+ if (Pred->getTerminator()->getNumSuccessors() != 1) {
+ if (isa<IndirectBrInst>(Pred->getTerminator())) {
+ DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (LoadBB->isEHPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ CriticalEdgePred.push_back(Pred);
+ } else {
+ // Only add the predecessors that will not be split for now.
+ PredLoads[Pred] = nullptr;
+ }
+ }
+
+ // Decide whether PRE is profitable for this load.
+ unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
+ assert(NumUnavailablePreds != 0 &&
+ "Fully available value should already be eliminated!");
+
+ // If this load is unavailable in multiple predecessors, reject it.
+ // FIXME: If we could restructure the CFG, we could make a common pred with
+ // all the preds that don't have an available LI and insert a new load into
+ // that one block.
+ if (NumUnavailablePreds != 1)
+ return false;
+
+ // Split critical edges, and update the unavailable predecessors accordingly.
+ for (BasicBlock *OrigPred : CriticalEdgePred) {
+ BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+ assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
+ PredLoads[NewPred] = nullptr;
+ DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
+ }
+
+ // Check if the load can safely be moved to all the unavailable predecessors.
+ bool CanDoPRE = true;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ SmallVector<Instruction*, 8> NewInsts;
+ for (auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+
+ // Do PHI translation to get its value in the predecessor if necessary. The
+ // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+
+ // If all preds have a single successor, then we know it is safe to insert
+ // the load on the pred (?!?), so we can insert code to materialize the
+ // pointer if it is not available.
+ PHITransAddr Address(LI->getPointerOperand(), DL, AC);
+ Value *LoadPtr = nullptr;
+ LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
+ *DT, NewInsts);
+
+ // If we couldn't find or insert a computation of this phi translated value,
+ // we fail PRE.
+ if (!LoadPtr) {
+ DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
+ CanDoPRE = false;
+ break;
+ }
+
+ PredLoad.second = LoadPtr;
+ }
+ // On failure, undo the address computations that were inserted above.
+ if (!CanDoPRE) {
+ while (!NewInsts.empty()) {
+ Instruction *I = NewInsts.pop_back_val();
+ if (MD) MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ // HINT: Don't revert the edge-splitting as following transformation may
+ // also need to split these critical edges. Report a change if any edge was split.
+ return !CriticalEdgePred.empty();
+ }
+
+ // Okay, we can eliminate this load by inserting a reload in the predecessor
+ // and using PHI construction to get the value in the other predecessors, do
+ // it.
+ DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ DEBUG(if (!NewInsts.empty())
+ dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
+ << *NewInsts.back() << '\n');
+
+ // Assign value numbers to the new instructions.
+ for (Instruction *I : NewInsts) {
+ // FIXME: We really _ought_ to insert these value numbers into their
+ // parent's availability map. However, in doing so, we risk getting into
+ // ordering issues. If a block hasn't been processed yet, we would be
+ // marking a value as AVAIL-IN, which isn't what we intend.
+ VN.lookup_or_add(I);
+ }
+
+ for (const auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+ Value *LoadPtr = PredLoad.second;
+ // Create the reload in front of the predecessor's terminator, with LI's alignment.
+ Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
+ LI->getAlignment(),
+ UnavailablePred->getTerminator());
+
+ // Transfer the old load's AA tags to the new load.
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags)
+ NewLoad->setAAMetadata(Tags);
+ // Also carry over invariant.load and invariant.group metadata when LI has them.
+ if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
+ if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+
+ // Transfer DebugLoc.
+ NewLoad->setDebugLoc(LI->getDebugLoc());
+
+ // Add the newly created load.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+ NewLoad));
+ MD->invalidateCachedPointerInfo(LoadPtr);
+ DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ }
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ++NumPRELoad;
+ return true;
+}
+
+/// Try to eliminate the load LI, whose dependencies are non-local, by
+/// constructing PHIs from the values available in other blocks and, failing
+/// that, by load PRE.  Returns true if something was changed.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+  // non-local speculations are not allowed under asan.
+  Function *F = LI->getParent()->getParent();
+  if (F->hasFnAttribute(Attribute::SanitizeAddress))
+    return false;
+
+  // Step 1: Find the non-local dependencies of the load.
+  LoadDepVect Deps;
+  MD->getNonLocalPointerDependency(LI, Deps);
+
+  // More than one hundred dependencies means the load is too expensive to be
+  // worth optimizing.
+  const unsigned NumDeps = Deps.size();
+  if (NumDeps > 100)
+    return false;
+
+  // A single entry that is neither a def nor a clobber means nothing is known
+  // about the load (e.g. after a phi translation failure).  Reject this early.
+  if (NumDeps == 1) {
+    const MemDepResult &Only = Deps[0].getResult();
+    if (!Only.isDef() && !Only.isClobber()) {
+      DEBUG(
+        dbgs() << "GVN: non-local load ";
+        LI->printAsOperand(dbgs());
+        dbgs() << " has unknown dependencies\n";
+      );
+      return false;
+    }
+  }
+
+  // If this load follows a GEP, see if we can PRE the indices before analyzing.
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+    for (auto OI = GEP->idx_begin(), OE = GEP->idx_end(); OI != OE; ++OI) {
+      if (auto *IdxInst = dyn_cast<Instruction>(OI->get()))
+        performScalarPRE(IdxInst);
+    }
+  }
+
+  // Step 2: Analyze the availability of the load.
+  AvailValInBlkVect ValuesPerBlock;
+  UnavailBlkVect UnavailableBlocks;
+  AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
+
+  // Without any predecessor that produces a known value there is nothing to
+  // build on.
+  if (ValuesPerBlock.empty())
+    return false;
+
+  // Step 3: Eliminate full redundancy.
+  //
+  // Some dependency does not produce a known value, so the load is at best
+  // partially redundant; that is Step 4.
+  if (!UnavailableBlocks.empty()) {
+    // Step 4: Eliminate partial redundancy.
+    if (!EnablePRE || !EnableLoadPRE)
+      return false;
+
+    return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
+  }
+
+  // All of the instructions we depend on produce a known value for this load,
+  // so it is fully redundant: insert PHIs and remove it now.
+  DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+  // Perform PHI construction.
+  Value *Replacement = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+  LI->replaceAllUsesWith(Replacement);
+
+  if (isa<PHINode>(Replacement))
+    Replacement->takeName(LI);
+  if (auto *ReplInst = dyn_cast<Instruction>(Replacement)) {
+    // Only overwrite the debug location when LI actually has one.
+    if (LI->getDebugLoc())
+      ReplInst->setDebugLoc(LI->getDebugLoc());
+  }
+  if (Replacement->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(Replacement);
+  markInstructionForDeletion(LI);
+  ++NumGVNLoad;
+  return true;
+}
+
+/// Use the condition of an llvm.assume call: propagate "condition == true"
+/// along the outgoing edges of its block and remember replacements implied by
+/// the assumption in ReplaceWithConstMap.  Returns true if propagateEquality
+/// reported a change.
+bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
+  assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
+         "This function can only be called with llvm.assume intrinsic");
+  Value *V = IntrinsicI->getArgOperand(0);
+
+  // A constant condition carries no information; the call is just removed.
+  if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
+    if (Cond->isZero()) {
+      // assume(false): insert a store of undef to null before the call to
+      // indicate that this code is not reachable.  FIXME: We could insert
+      // unreachable instruction directly because we can modify the CFG.
+      Type *Int8Ty = Type::getInt8Ty(V->getContext());
+      new StoreInst(UndefValue::get(Int8Ty),
+                    Constant::getNullValue(Int8Ty->getPointerTo()),
+                    IntrinsicI);
+    }
+    markInstructionForDeletion(IntrinsicI);
+    return false;
+  }
+
+  Constant *True = ConstantInt::getTrue(V->getContext());
+  BasicBlock *AssumeBB = IntrinsicI->getParent();
+  bool Changed = false;
+
+  // The property only holds in dominated successors; propagateEquality will
+  // check dominance for us.
+  for (BasicBlock *Successor : successors(AssumeBB)) {
+    BasicBlockEdge Edge(AssumeBB, Successor);
+    Changed |= propagateEquality(V, True, Edge, false);
+  }
+
+  // We can replace assume value with true, which covers cases like this:
+  // call void @llvm.assume(i1 %cmp)
+  // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
+  ReplaceWithConstMap[V] = True;
+
+  // If one of *cmp *eq operand is const, adding it to map will cover this:
+  // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
+  // call void @llvm.assume(i1 %cmp)
+  // ret float %0 ; will change it to ret float 3.000000e+00
+  auto *CmpI = dyn_cast<CmpInst>(V);
+  if (!CmpI)
+    return Changed;
+
+  // Only equality compares qualify; an unordered fcmp additionally needs the
+  // no-NaNs flag.
+  const CmpInst::Predicate Pred = CmpI->getPredicate();
+  const bool IsEquality =
+      Pred == CmpInst::Predicate::ICMP_EQ ||
+      Pred == CmpInst::Predicate::FCMP_OEQ ||
+      (Pred == CmpInst::Predicate::FCMP_UEQ &&
+       CmpI->getFastMathFlags().noNaNs());
+  if (!IsEquality)
+    return Changed;
+
+  // Put a constant operand, if there is one, on the right-hand side.
+  Value *VarSide = CmpI->getOperand(0);
+  Value *ConstSide = CmpI->getOperand(1);
+  if (isa<Constant>(VarSide))
+    std::swap(VarSide, ConstSide);
+
+  // Record the replacement only if exactly one operand is constant.
+  if (auto *RHSConst = dyn_cast<Constant>(ConstSide))
+    if (!isa<Constant>(VarSide))
+      ReplaceWithConstMap[VarSide] = RHSConst;
+
+  return Changed;
+}
+
+/// Patch the replacement Repl so that it is not more restrictive than the
+/// instruction I it is going to replace.
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+  // When both are binary operators, keep only the IR flags they share.
+  if (auto *ReplOp = dyn_cast<BinaryOperator>(Repl))
+    if (auto *Op = dyn_cast<BinaryOperator>(I))
+      ReplOp->andIRFlags(Op);
+
+  // Metadata only exists on instructions.
+  Instruction *ReplInst = dyn_cast<Instruction>(Repl);
+  if (!ReplInst)
+    return;
+
+  // FIXME: If both the original and replacement value are part of the
+  // same control-flow region (meaning that the execution of one
+  // guarantees the execution of the other), then we can combine the
+  // noalias scopes here and do better than the general conservative
+  // answer used in combineMetadata().
+
+  // In general, GVN unifies expressions over different control-flow
+  // regions, and so we need a conservative combination of the noalias
+  // scopes.
+  static const unsigned KnownIDs[] = {
+      LLVMContext::MD_tbaa,           LLVMContext::MD_alias_scope,
+      LLVMContext::MD_noalias,        LLVMContext::MD_range,
+      LLVMContext::MD_fpmath,         LLVMContext::MD_invariant_load,
+      LLVMContext::MD_invariant_group};
+  combineMetadata(ReplInst, I, KnownIDs);
+}
+
+ /// Replace every use of \p I with \p Repl, first patching \p Repl (see
+ /// patchReplacementInstruction) so it is not more restrictive than \p I.
+ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+   // The patching must happen before the uses are rewritten.
+   patchReplacementInstruction(I, Repl);
+   I->replaceAllUsesWith(Repl);
+ }
+
+ /// Attempt to eliminate a load, first by eliminating it
+ /// locally, and then attempting non-local elimination if that fails.
+ ///
+ /// Nothing is done when memory dependence analysis is unavailable or when
+ /// the load is not simple. An unused load is simply marked for deletion.
+ /// Otherwise the memdep result for the load decides the strategy:
+ ///  - clobber: try to synthesize the value from the clobbering store, load
+ ///    or memory intrinsic;
+ ///  - non-local: defer to processNonLocalLoad();
+ ///  - def: forward the stored/loaded value (coercing the type if needed),
+ ///    or fold to undef/zero after alloca, malloc-like, lifetime.start or
+ ///    calloc-like definitions.
+ ///
+ /// \returns true if the load was replaced/marked for deletion (or if
+ /// processNonLocalLoad() reported a change), false otherwise.
+ bool GVN::processLoad(LoadInst *L) {
+   if (!MD)
+     return false;
+
+   if (!L->isSimple())
+     return false;
+
+   if (L->use_empty()) {
+     markInstructionForDeletion(L);
+     return true;
+   }
+
+   // Ask memdep what this load depends on.
+   MemDepResult Dep = MD->getDependency(L);
+   const DataLayout &DL = L->getModule()->getDataLayout();
+
+   // If we have a clobber and target data is around, see if this is a clobber
+   // that we can fix up through code synthesis.
+   if (Dep.isClobber()) {
+     // Check to see if we have something like this:
+     //   store i32 123, i32* %P
+     //   %A = bitcast i32* %P to i8*
+     //   %B = gep i8* %A, i32 1
+     //   %C = load i8* %B
+     //
+     // We could do that by recognizing if the clobber instructions are obviously
+     // a common base + constant offset, and if the previous store (or memset)
+     // completely covers this load. This sort of thing can happen in bitfield
+     // access code.
+     Value *AvailVal = nullptr;
+     if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
+       int Offset = AnalyzeLoadFromClobberingStore(
+           L->getType(), L->getPointerOperand(), DepSI);
+       if (Offset != -1)
+         AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
+                                         L->getType(), L, DL);
+     }
+
+     // Check to see if we have something like this:
+     //    load i32* P
+     //    load i8* (P+1)
+     // if we have this, replace the later with an extraction from the former.
+     if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) {
+       // If this is a clobber and L is the first instruction in its block, then
+       // we have the first instruction in the entry block.
+       if (DepLI == L)
+         return false;
+
+       int Offset = AnalyzeLoadFromClobberingLoad(
+           L->getType(), L->getPointerOperand(), DepLI, DL);
+       if (Offset != -1)
+         AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
+     }
+
+     // If the clobbering value is a memset/memcpy/memmove, see if we can forward
+     // a value on from it.
+     if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
+       int Offset = AnalyzeLoadFromClobberingMemInst(
+           L->getType(), L->getPointerOperand(), DepMI, DL);
+       if (Offset != -1)
+         AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL);
+     }
+
+     if (AvailVal) {
+       DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
+                    << *AvailVal << '\n' << *L << "\n\n\n");
+
+       // Replace the load!
+       L->replaceAllUsesWith(AvailVal);
+       if (AvailVal->getType()->getScalarType()->isPointerTy())
+         MD->invalidateCachedPointerInfo(AvailVal);
+       markInstructionForDeletion(L);
+       ++NumGVNLoad;
+       return true;
+     }
+
+     // If the value isn't available, don't do anything!
+     DEBUG(
+       // fast print dep, using operator<< on instruction is too slow.
+       dbgs() << "GVN: load ";
+       L->printAsOperand(dbgs());
+       Instruction *I = Dep.getInst();
+       dbgs() << " is clobbered by " << *I << '\n';
+     );
+     return false;
+   }
+
+   // If it is defined in another block, try harder.
+   if (Dep.isNonLocal())
+     return processNonLocalLoad(L);
+
+   if (!Dep.isDef()) {
+     DEBUG(
+       // fast print dep, using operator<< on instruction is too slow.
+       dbgs() << "GVN: load ";
+       L->printAsOperand(dbgs());
+       dbgs() << " has unknown dependence\n";
+     );
+     return false;
+   }
+
+   Instruction *DepInst = Dep.getInst();
+   if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+     Value *StoredVal = DepSI->getValueOperand();
+
+     // The store and load are to a must-aliased pointer, but they may not
+     // actually have the same type. See if we know how to reuse the stored
+     // value (depending on its type).
+     if (StoredVal->getType() != L->getType()) {
+       IRBuilder<> Builder(L);
+       StoredVal =
+           CoerceAvailableValueToLoadType(StoredVal, L->getType(), Builder, DL);
+       if (!StoredVal)
+         return false;
+
+       DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
+                    << '\n' << *L << "\n\n\n");
+     }
+
+     // Remove it!
+     L->replaceAllUsesWith(StoredVal);
+     if (StoredVal->getType()->getScalarType()->isPointerTy())
+       MD->invalidateCachedPointerInfo(StoredVal);
+     markInstructionForDeletion(L);
+     ++NumGVNLoad;
+     return true;
+   }
+
+   if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+     Value *AvailableVal = DepLI;
+
+     // The loads are of a must-aliased pointer, but they may not actually have
+     // the same type. See if we know how to reuse the previously loaded value
+     // (depending on its type).
+     if (DepLI->getType() != L->getType()) {
+       IRBuilder<> Builder(L);
+       AvailableVal =
+           CoerceAvailableValueToLoadType(DepLI, L->getType(), Builder, DL);
+       if (!AvailableVal)
+         return false;
+
+       DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
+                    << "\n" << *L << "\n\n\n");
+     }
+
+     // Remove it!
+     patchAndReplaceAllUsesWith(L, AvailableVal);
+     // Invalidate the cached pointer info of the value that actually replaced
+     // the load. After a coercion this is not DepLI itself (and may be
+     // pointer-typed even when DepLI is not); this mirrors the store case.
+     if (AvailableVal->getType()->getScalarType()->isPointerTy())
+       MD->invalidateCachedPointerInfo(AvailableVal);
+     markInstructionForDeletion(L);
+     ++NumGVNLoad;
+     return true;
+   }
+
+   // If this load really doesn't depend on anything, then we must be loading an
+   // undef value. This can happen when loading for a fresh allocation with no
+   // intervening stores, for example.
+   if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+     L->replaceAllUsesWith(UndefValue::get(L->getType()));
+     markInstructionForDeletion(L);
+     ++NumGVNLoad;
+     return true;
+   }
+
+   // If this load occurs right after a lifetime begin,
+   // then the loaded value is undefined.
+   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) {
+     if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+       L->replaceAllUsesWith(UndefValue::get(L->getType()));
+       markInstructionForDeletion(L);
+       ++NumGVNLoad;
+       return true;
+     }
+   }
+
+   // If this load follows a calloc (which zero initializes memory),
+   // then the loaded value is zero
+   if (isCallocLikeFn(DepInst, TLI)) {
+     L->replaceAllUsesWith(Constant::getNullValue(L->getType()));
+     markInstructionForDeletion(L);
+     ++NumGVNLoad;
+     return true;
+   }
+
+   return false;
+ }
+
+ // In order to find a leader for a given value number at a
+ // specific basic block, we first obtain the list of all Values for that number,
+ // and then scan the list to find one whose block dominates the block in
+ // question. This is fast because dominator tree queries consist of only
+ // a few comparisons of DFS numbers.
+ //
+ // Returns nullptr when no recorded value for 'num' lives in a block that
+ // dominates 'BB'. A dominating constant is preferred and returned as soon as
+ // it is seen; otherwise the first dominating value in list order is returned.
+ //
+ // The table is queried with find() rather than operator[] so that a miss
+ // does not insert an empty entry into LeaderTable (which would grow the map
+ // on every failed lookup) and so that the entry is not copied.
+ Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+   auto It = LeaderTable.find(num);
+   if (It == LeaderTable.end() || !It->second.Val)
+     return nullptr;
+
+   Value *Val = nullptr;
+   for (const LeaderTableEntry *Node = &It->second; Node; Node = Node->Next) {
+     if (!DT->dominates(Node->BB, BB))
+       continue;
+     if (isa<Constant>(Node->Val))
+       return Node->Val;
+     if (!Val)
+       Val = Node->Val;
+   }
+
+   return Val;
+ }
+
+ /// Given the edge \p E (from E.getStart() to E.getEnd()), return true if
+ /// every path from the entry block to the edge's end passes via this edge.
+ /// In particular the end block must not be reachable via another edge from
+ /// the start block.
+ ///
+ /// The answer is a conservative approximation: it is true exactly when the
+ /// end block has a single predecessor. \p DT is not consulted.
+ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+                                        DominatorTree *DT) {
+   // In theory the end block could have several predecessors and still only
+   // be reachable through this edge (for instance when it heads a loop that
+   // is only entered from the start block). In practice that is pointless to
+   // handle: when GVN runs such loops have preheaders, so the end block has
+   // already been given a single predecessor, namely the start block.
+   const BasicBlock *From = E.getStart();
+   const BasicBlock *SolePred = E.getEnd()->getSinglePredecessor();
+   assert((!SolePred || SolePred == From) &&
+          "No edge between these basic blocks!");
+   (void)From;
+   return SolePred != nullptr;
+ }
+
+ // Rewrites each operand of Instr that has an entry in ReplaceWithConstMap
+ // to the mapped value. Returns true if at least one operand was rewritten.
+ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
+   bool AnyReplaced = false;
+   for (unsigned Idx = 0; Idx < Instr->getNumOperands(); ++Idx) {
+     Value *Old = Instr->getOperand(Idx);
+     auto Found = ReplaceWithConstMap.find(Old);
+     if (Found == ReplaceWithConstMap.end())
+       continue;
+
+     assert(!isa<Constant>(Old) &&
+            "Replacing constants with constants is invalid");
+     // Log before mutating so the dump shows the instruction as it was.
+     DEBUG(dbgs() << "GVN replacing: " << *Old << " with " << *Found->second
+                  << " in instruction " << *Instr << '\n');
+     Instr->setOperand(Idx, Found->second);
+     AnyReplaced = true;
+   }
+   return AnyReplaced;
+ }
+
+ /// The given values are known to be equal in every block dominated by 'Root'. Exploit this by
+ /// replacing dominated uses of 'LHS' with 'RHS', and by deriving further equalities from boolean
+ /// and/or/compare operands through a worklist. Returns whether any use was replaced.
+ /// If DominatesByEdge is false, uses are replaced in blocks dominated by Root.getEnd() rather than by the edge.
+ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+ bool DominatesByEdge) {
+ SmallVector<std::pair<Value*, Value*>, 4> Worklist; // Pending (LHS, RHS) equalities.
+ Worklist.push_back(std::make_pair(LHS, RHS));
+ bool Changed = false;
+ // For speed, compute a conservative fast approximation to
+ // DT->dominates(Root, Root.getEnd());
+ bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
+ while (!Worklist.empty()) {
+ std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+ LHS = Item.first; RHS = Item.second;
+
+ if (LHS == RHS) // Trivial equality, nothing to propagate.
+ continue;
+ assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+ // Don't try to propagate equalities between constants.
+ if (isa<Constant>(LHS) && isa<Constant>(RHS))
+ continue;
+
+ // Prefer a constant on the right-hand side, or an Argument if no constants.
+ if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+ std::swap(LHS, RHS);
+ assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
+ uint32_t LVN = VN.lookup_or_add(LHS);
+ if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+ (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
+ uint32_t RVN = VN.lookup_or_add(RHS);
+ if (LVN < RVN) {
+ std::swap(LHS, RHS);
+ LVN = RVN; // Keep LVN describing the (new) left-hand side.
+ }
+ }
+
+ // If value numbering later sees that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
+ // the invariant that instructions only occur in the leader table for their
+ // own value number (this is used by removeFromLeaderTable), do not do this
+ // if RHS is an instruction (if an instruction in the scope is morphed into
+ // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+ // using the leader table is about compiling faster, not optimizing better).
+ // The leader table only tracks basic blocks, not edges. Only add to it if we
+ // have the simple case where the edge dominates the end.
+ if (RootDominatesEnd && !isa<Instruction>(RHS))
+ addToLeaderTable(LVN, RHS, Root.getEnd());
+
+ // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
+ // LHS always has at least one use that is not dominated by Root, this will
+ // never do anything if LHS has only one use.
+ if (!LHS->hasOneUse()) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
+ : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd());
+
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ }
+
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
+ if (!RHS->getType()->isIntegerTy(1))
+ // Not a boolean equality - bail out.
+ continue;
+ ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ // RHS neither 'true' nor 'false' - bail out.
+ continue;
+ // Whether RHS equals 'true'. Otherwise it equals 'false'.
+ bool isKnownTrue = CI->isAllOnesValue(); // RHS is an i1 here, so all-ones means 'true'.
+ bool isKnownFalse = !isKnownTrue;
+
+ // If "A && B" is known true then both A and B are known true. If "A || B"
+ // is known false then both A and B are known false.
+ Value *A, *B;
+ if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
+ (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+ Worklist.push_back(std::make_pair(A, RHS));
+ Worklist.push_back(std::make_pair(B, RHS));
+ continue;
+ }
+
+ // If we are propagating an equality like "(A == B)" == "true" then also
+ // propagate the equality A == B. When propagating a comparison such as
+ // "(A >= B)" == "true", replace all instances of "A < B" with "false".
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
+ Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+ // If "A == B" is known true, or "A != B" is known false, then replace
+ // A with B everywhere in the scope.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
+ Worklist.push_back(std::make_pair(Op0, Op1));
+
+ // Handle the floating point versions of equality comparisons too.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+
+ // Floating point -0.0 and 0.0 compare equal, so we can only
+ // propagate values if we know that we have a constant and that
+ // its value is non-zero.
+
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+
+ if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
+ Worklist.push_back(std::make_pair(Op0, Op1));
+ }
+
+ // If "A >= B" is known true, replace "A < B" with false everywhere.
+ CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+ Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); // The inverse compare has the opposite truth value.
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+ // If the number we were assigned was brand new then there is no point in
+ // looking for an instruction realizing it: there cannot be one!
+ if (Num < NextNum) {
+ Value *NotCmp = findLeader(Root.getEnd(), Num);
+ if (NotCmp && isa<Instruction>(NotCmp)) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
+ : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
+ Root.getEnd());
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ }
+ }
+ // Ensure that any instruction in scope that gets the "A < B" value number
+ // is replaced with false.
+ // The leader table only tracks basic blocks, not edges. Only add to it if we
+ // have the simple case where the edge dominates the end.
+ if (RootDominatesEnd)
+ addToLeaderTable(Num, NotVal, Root.getEnd());
+
+ continue;
+ }
+ }
+
+ return Changed;
+ }
+
+ /// Process one instruction: simplify it if possible, handle assumes, loads, conditional branches and
+ /// switches specially, otherwise value-number it and replace it with a dominating leader. Returns whether a change was made.
+ bool GVN::processInstruction(Instruction *I) {
+ // Ignore dbg info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // If the instruction can be easily simplified then do so now in preference
+ // to value numbering it. Value numbering often exposes redundancies, for
+ // example if it determines that %y is equal to %x then the instruction
+ // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+ I->replaceAllUsesWith(V);
+ if (MD && V->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(I);
+ ++NumGVNSimpl;
+ return true;
+ }
+
+ if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
+ if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
+ return processAssumeIntrinsic(IntrinsicI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (processLoad(LI))
+ return true;
+
+ unsigned Num = VN.lookup_or_add(LI); // Load was kept: number it and record it as a leader.
+ addToLeaderTable(Num, LI, LI->getParent());
+ return false;
+ }
+
+ // For conditional branches, we can perform simple conditional propagation on
+ // the condition value itself.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (!BI->isConditional())
+ return false;
+
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ // Avoid multiple edges early.
+ if (TrueSucc == FalseSucc)
+ return false;
+
+ BasicBlock *Parent = BI->getParent();
+ bool Changed = false;
+
+ Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+ BasicBlockEdge TrueE(Parent, TrueSucc);
+ Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); // Condition is true along the taken edge.
+
+ Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+ BasicBlockEdge FalseE(Parent, FalseSucc);
+ Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); // Condition is false along the other edge.
+
+ return Changed;
+ }
+
+ // For switches, propagate the case values into the case destinations.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Value *SwitchCond = SI->getCondition();
+ BasicBlock *Parent = SI->getParent();
+ bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *Dst = i.getCaseSuccessor();
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
+ Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+ }
+ }
+ return Changed;
+ }
+
+ // Instructions with void type don't return a value, so there's
+ // no point in trying to find redundancies in them.
+ if (I->getType()->isVoidTy())
+ return false;
+
+ uint32_t NextNum = VN.getNextUnusedValueNumber(); // Snapshot taken before numbering I, to detect a brand new number below.
+ unsigned Num = VN.lookup_or_add(I);
+
+ // Allocas (always uniquely numbered), terminators and PHI nodes are only recorded
+ // in the leader table; fast-fail them without attempting any elimination.
+ if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // If the number we were assigned was a brand new VN, then we don't
+ // need to do a lookup to see if the number already exists
+ // somewhere in the domtree: it can't!
+ if (Num >= NextNum) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // Perform fast-path value-number based elimination of values inherited from
+ // dominators.
+ Value *Repl = findLeader(I->getParent(), Num);
+ if (!Repl) {
+ // Failure, just remember this instance for future use.
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ } else if (Repl == I) {
+ // If I was the result of a shortcut PRE, it might already be in the table
+ // and the best replacement for itself. Nothing to do.
+ return false;
+ }
+
+ // Remove it!
+ patchAndReplaceAllUsesWith(I, Repl);
+ if (MD && Repl->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(Repl);
+ markInstructionForDeletion(I);
+ return true;
+ }
+
+ /// runOnFunction - This is the main transformation entry point for a function.
+ ///
+ /// Fetches the required analyses, merges blocks into their predecessors,
+ /// iterates GVN until an iteration reports no change, optionally runs PRE to
+ /// a fixed point, and finally resets the per-run state.
+ /// \returns true if the function was modified.
+ bool GVN::runOnFunction(Function& F) {
+   if (skipOptnoneFunction(F))
+     return false;
+
+   // Memory dependence is only requested when loads are to be processed.
+   if (!NoLoads)
+     MD = &getAnalysis<MemoryDependenceAnalysis>();
+   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+   VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+   VN.setMemDep(MD);
+   VN.setDomTree(DT);
+
+   bool Changed = false;
+
+   // Merge unconditional branches, allowing PRE to catch more
+   // optimization opportunities. The iterator is advanced before the merge
+   // attempt because a successful merge removes the block.
+   Function::iterator BlockIt = F.begin(), BlockEnd = F.end();
+   while (BlockIt != BlockEnd) {
+     BasicBlock *BB = &*BlockIt++;
+     if (MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD)) {
+       ++NumGVNBlocks;
+       Changed = true;
+     }
+   }
+
+   // Run GVN iterations until one of them makes no change.
+   unsigned Iteration = 0;
+   bool MadeProgress;
+   do {
+     DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+     MadeProgress = iterateOnFunction(F);
+     Changed |= MadeProgress;
+     ++Iteration;
+   } while (MadeProgress);
+
+   if (EnablePRE) {
+     // Fabricate val-num for dead-code in order to suppress assertion in
+     // performPRE().
+     assignValNumForDeadCode();
+     while (performPRE(F))
+       Changed = true;
+   }
+
+   // FIXME: Should perform GVN again after PRE does something. PRE can move
+   // computations into blocks where they become fully redundant. Note that
+   // we can't do this until PRE's critical edge splitting updates memdep.
+   // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+   cleanupGlobalSets();
+   // DeadBlocks is not cleared by cleanupGlobalSets() because that function is
+   // called for each iteration; clear it here, once per run.
+   DeadBlocks.clear();
+
+   return Changed;
+ }
+
+ bool GVN::processBlock(BasicBlock *BB) { // Runs GVN over every instruction of BB; returns whether anything changed.
+ // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+ // (and incrementing BI before processing an instruction).
+ assert(InstrsToErase.empty() &&
+ "We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB)) // Blocks already recorded as dead are skipped.
+ return false;
+
+ // Clearing map before every BB because it can be used only for single BB.
+ ReplaceWithConstMap.clear();
+ bool ChangedFunction = false;
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ if (!ReplaceWithConstMap.empty())
+ ChangedFunction |= replaceOperandsWithConsts(&*BI);
+ ChangedFunction |= processInstruction(&*BI);
+
+ if (InstrsToErase.empty()) { // Nothing queued for deletion: simply move on.
+ ++BI;
+ continue;
+ }
+
+ // If we need some instructions deleted, do it now.
+ NumGVNInstr += InstrsToErase.size();
+
+ // Avoid iterator invalidation: step BI back before erasing (when it is not the first instruction) and advance again afterwards.
+ bool AtStart = BI == BB->begin();
+ if (!AtStart)
+ --BI;
+
+ for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(),
+ E = InstrsToErase.end(); I != E; ++I) {
+ DEBUG(dbgs() << "GVN removed: " << **I << '\n');
+ if (MD) MD->removeInstruction(*I);
+ DEBUG(verifyRemoved(*I));
+ (*I)->eraseFromParent();
+ }
+ InstrsToErase.clear();
+
+ if (AtStart)
+ BI = BB->begin(); // Restart from the (new) first instruction of the block.
+ else
+ ++BI;
+ }
+
+ return ChangedFunction;
+ }
+
+ // Instantiate an expression in a predecessor that lacked it.
+ //
+ // Every operand of Instr that is not an argument, constant or global is
+ // rewritten to the leader of its value number in Pred. If any such operand
+ // has no value number or no leader in Pred, the function returns false
+ // (operands rewritten up to that point stay rewritten). On success Instr is
+ // inserted before Pred's terminator, renamed with a ".pre" suffix, and
+ // registered under ValNo in the value table and the leader table.
+ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+                                     unsigned int ValNo) {
+   // Because we are going top-down through the block, all value numbers
+   // will be available in the predecessor by the time we need them. Any
+   // that weren't originally present will have been instantiated earlier
+   // in this loop.
+   for (unsigned Idx = 0, NumOps = Instr->getNumOperands(); Idx != NumOps;
+        ++Idx) {
+     Value *Operand = Instr->getOperand(Idx);
+     if (isa<Argument>(Operand) || isa<Constant>(Operand) ||
+         isa<GlobalValue>(Operand))
+       continue;
+
+     // This could be a newly inserted instruction, in which case, we won't
+     // find a value number, and should give up before we hurt ourselves.
+     // FIXME: Rewrite the infrastructure to make it easier to value number
+     // and process newly inserted instructions.
+     if (!VN.exists(Operand))
+       return false;
+
+     // Fail out if the operand is not available in the PRE predecessor.
+     // This is typically because of loads which are not value numbered
+     // precisely.
+     Value *Leader = findLeader(Pred, VN.lookup(Operand));
+     if (!Leader)
+       return false;
+     Instr->setOperand(Idx, Leader);
+   }
+
+   Instr->insertBefore(Pred->getTerminator());
+   Instr->setName(Instr->getName() + ".pre");
+   // NOTE(review): this assigns the instruction's own debug location back to
+   // itself; kept as in the original.
+   Instr->setDebugLoc(Instr->getDebugLoc());
+   VN.add(Instr, ValNo);
+
+   // Update the availability map to include the new instruction.
+   addToLeaderTable(ValNo, Instr, Pred);
+   return true;
+ }
+
+ bool GVN::performScalarPRE(Instruction *CurInst) { // Try scalar PRE on CurInst; returns true if it was replaced by a PHI and erased.
+ SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; // (available value or nullptr, predecessor) pairs.
+
+ if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
+
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
+
+ // We don't currently value number ANY inline asm calls.
+ if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+ if (CallI->isInlineAsm())
+ return false;
+
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+ predMap.clear(); // predMap was constructed just above, so it is already empty here.
+
+ for (BasicBlock *P : predecessors(CurrentBlock)) {
+ // We're not interested in PRE where the block is its
+ // own predecessor, or in blocks with predecessors
+ // that are not reachable.
+ if (P == CurrentBlock) {
+ NumWithout = 2; // Forces the "NumWithout > 1" bail-out below.
+ break;
+ } else if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
+
+ Value *predV = findLeader(P, ValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
+
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout > 1 || NumWith == 0)
+ return false;
+
+ // We may have a case where all predecessors have the instruction,
+ // and we just need to insert a phi node. Otherwise, perform
+ // insertion.
+ Instruction *PREInstr = nullptr;
+
+ if (NumWithout != 0) {
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
+ // We need to insert somewhere, so let's give it a shot
+ PREInstr = CurInst->clone();
+ if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+ // If we failed insertion, make sure we remove the instruction.
+ DEBUG(verifyRemoved(PREInstr));
+ delete PREInstr;
+ return false;
+ }
+ }
+
+ // Either we should have filled in the PRE instruction, or we should
+ // not have needed insertions.
+ assert (PREInstr != nullptr || NumWithout == 0);
+
+ ++NumGVNPRE;
+
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", &CurrentBlock->front());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first)
+ Phi->addIncoming(V, predMap[i].second);
+ else
+ Phi->addIncoming(PREInstr, PREPred); // The only entry without a value is the PRE predecessor.
+ }
+
+ VN.add(Phi, ValNo);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (MD && Phi->getType()->getScalarType()->isPointerTy())
+ MD->invalidateCachedPointerInfo(Phi);
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+ DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
+ DEBUG(verifyRemoved(CurInst));
+ CurInst->eraseFromParent();
+ ++NumGVNInstr;
+
+ return true;
+ }
+
+ /// Perform a purely local form of PRE that looks for diamond
+ /// control flow patterns and attempts to perform simple PRE at the join point.
+ ///
+ /// Visits blocks depth-first from the entry block, skipping the entry block
+ /// and EH pads, then splits any critical edges scheduled along the way.
+ /// \returns true if an instruction was PRE'd or an edge was split.
+ bool GVN::performPRE(Function &F) {
+   bool Changed = false;
+   BasicBlock *Entry = &F.getEntryBlock();
+
+   for (BasicBlock *BB : depth_first(Entry)) {
+     // Nothing to PRE in the entry block, and no PRE on an EH pad.
+     if (BB == Entry || BB->isEHPad())
+       continue;
+
+     BasicBlock::iterator It = BB->begin(), End = BB->end();
+     while (It != End) {
+       // Advance first: performScalarPRE may erase the instruction.
+       Instruction *Candidate = &*It++;
+       Changed |= performScalarPRE(Candidate);
+     }
+   }
+
+   Changed |= splitCriticalEdges();
+   return Changed;
+ }
+
+ /// Split the critical edge connecting the given two blocks, and return
+ /// the block inserted to the critical edge. When memory dependence analysis
+ /// is in use, its cached predecessor information is invalidated afterwards.
+ BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+   BasicBlock *NewBlock =
+       SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT));
+   if (MD) {
+     MD->invalidateCachedPredecessors();
+   }
+   return NewBlock;
+ }
+
+ /// Split critical edges found during the previous
+ /// iteration that may enable further optimization.
+ ///
+ /// Drains the toSplit worklist. \returns false if the worklist was empty,
+ /// true otherwise.
+ bool GVN::splitCriticalEdges() {
+   if (toSplit.empty())
+     return false;
+
+   // The list is known to be non-empty here, so at least one edge is split.
+   while (!toSplit.empty()) {
+     std::pair<TerminatorInst*, unsigned> Pending = toSplit.pop_back_val();
+     SplitCriticalEdge(Pending.first, Pending.second,
+                       CriticalEdgeSplittingOptions(DT));
+   }
+
+   if (MD)
+     MD->invalidateCachedPredecessors();
+   return true;
+ }
+
+ /// Executes one iteration of GVN.
+ ///
+ /// Resets the value-numbering state, then processes every block of \p F in
+ /// reverse post-order. \returns true if any block reported a change.
+ bool GVN::iterateOnFunction(Function &F) {
+   cleanupGlobalSets();
+
+   // Save the blocks this function has before transformation begins. GVN may
+   // split critical edges, and hence may invalidate the RPO/DT iterator.
+   std::vector<BasicBlock *> Blocks;
+   Blocks.reserve(256);
+
+   // Reverse post-order is needed for value numbering with phi construction
+   // to work.
+   ReversePostOrderTraversal<Function *> RPOT(&F);
+   for (BasicBlock *BB : RPOT)
+     Blocks.push_back(BB);
+
+   // Top-down walk over the saved block order.
+   bool Changed = false;
+   for (BasicBlock *BB : Blocks)
+     Changed |= processBlock(BB);
+
+   return Changed;
+ }
+
+ /// Reset the state that is rebuilt on every GVN iteration: the value table,
+ /// the leader table, and the allocator used for leader-table entries.
+ /// DeadBlocks is deliberately left untouched here.
+ void GVN::cleanupGlobalSets() {
+   VN.clear();
+   LeaderTable.clear();
+   // Reset the allocator only after the table has been emptied.
+   TableAllocator.Reset();
+ }
+
+/// Assert that \p Inst no longer occurs in our internal data structures:
+/// the value-number table (VN) and every chain of the leader table.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+ VN.verifyRemoved(Inst);
+
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+ I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+ const LeaderTableEntry *Node = &I->second;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+ while (Node->Next) { // Check the rest of this entry's linked list.
+ Node = Node->Next;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+ }
+ }
+}
+
+/// BB is declared dead, which implies other blocks become dead as well. This
+/// function adds all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (BasicBlock *B : Dom) {
+ for (BasicBlock *S : successors(B)) {
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (BasicBlock *P : predecessors(S))
+ if (!DeadBlocks.count(P)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+ // While S is not dominated by D, it is dead by now. This can happen
+ // if S already had a dead predecessor before D was declared
+ // dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B)) // B was proved dead after it was added to DF.
+ continue;
+
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); // Copy: edges may be split below.
+ for (BasicBlock *P : Preds) {
+ if (!DeadBlocks.count(P))
+ continue;
+
+ if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S); // The new edge block is dead; use it as P below.
+ }
+
+ for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+ PHINode &Phi = cast<PHINode>(*II);
+ Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+ UndefValue::get(Phi.getType()));
+ }
+ }
+ }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. conditional
+// branch with constant condition), perform the following analyses and
+// transformations.
+// 1) If the dead outgoing edge is a critical edge, split it. Let
+// R be the target of the dead outgoing edge.
+// 2) Identify the set of dead blocks implied by the branch's dead outgoing
+// edge. The result of this step will be {X| X is dominated by R}
+// 3) Identify those blocks which have at least one dead predecessor. The
+// result of this step will be dominance-frontier(R).
+// 4) Update the PHIs in DF(R) by replacing the operands corresponding to
+// dead blocks with "UndefVal" in the hope these PHIs will be optimized away.
+//
+// Return true iff *NEW* dead code is found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If a branch has two identical successors, we cannot declare either dead.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot = Cond->getZExtValue() ? // Pick the successor never taken.
+ BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot)) // Already known dead: nothing new.
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor()) // Step 1: isolate the dead edge.
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
+
+// performPRE() will trigger an assert if it comes across an instruction
+// without an associated val-num. As there are normally far more live than dead
+// instructions, it makes more sense just to "fabricate" a val-number for the
+// dead code than to check whether each instruction involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+ for (BasicBlock *BB : DeadBlocks) {
+ for (Instruction &Inst : *BB) {
+ unsigned ValNum = VN.lookup_or_add(&Inst);
+ addToLeaderTable(ValNum, &Inst, BB);
+ }
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 0000000..ec5e15f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,2199 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+// 1. The exit condition for the loop is canonicalized to compare the
+// induction value against the exit value. This turns loops like:
+// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+// 2. Any use outside of the loop of an expression derived from the indvar
+// is changed to compute the derived value outside of the loop, eliminating
+// the dependence on the exit value of the induction variable. If the only
+// purpose of the loop is to compute the exit value of some derived
+// expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumWidened , "Number of indvars widened");
+STATISTIC(NumReplaced , "Number of exit values replaced");
+STATISTIC(NumLFTR , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the hidden -verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars"));
+
+static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden,
+ cl::desc("Reduce live induction variables."));
+
+enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl }; // -replexitval modes.
+
+static cl::opt<ReplaceExitVal> ReplaceExitValue(
+ "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), // Default: "cheap".
+ cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
+ cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
+ clEnumValN(OnlyCheapRepl, "cheap",
+ "only replace exit value when the cost is cheap"),
+ clEnumValN(AlwaysRepl, "always",
+ "always replace exit value whenever possible"),
+ clEnumValEnd));
+
+namespace {
+struct RewritePhi; // Forward declaration; used by canLoopBeDeleted below.
+
+class IndVarSimplify : public LoopPass {
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ TargetLibraryInfo *TLI; // NOTE(review): not in the ctor init list; presumably set in runOnLoop.
+ const TargetTransformInfo *TTI; // NOTE(review): likewise not initialized by the ctor.
+
+ SmallVector<WeakVH, 16> DeadInsts; // Values queued for later deletion.
+ bool Changed; // Set to true by the transformations that modify the IR.
+public:
+
+ static char ID; // Pass identification, replacement for typeid
+ IndVarSimplify()
+ : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) {
+ initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreservedID(LCSSAID);
+ AU.setPreservesCFG();
+ }
+
+private:
+ void releaseMemory() override {
+ DeadInsts.clear();
+ }
+
+ bool isValidRewrite(Value *FromVal, Value *ToVal);
+
+ void handleFloatingPointIV(Loop *L, PHINode *PH);
+ void rewriteNonIntegerIVs(Loop *L);
+
+ void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+
+ bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
+ void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
+
+ Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
+ PHINode *IndVar, SCEVExpander &Rewriter);
+
+ void sinkUnusedInvariants(Loop *L);
+
+ Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
+ Instruction *InsertPt, Type *Ty);
+};
+}
+
+char IndVarSimplify::ID = 0; // Definition of the static pass ID.
+INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
+ "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(IndVarSimplify, "indvars",
+ "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() { // Factory for a new pass instance.
+ return new IndVarSimplify();
+}
+
+/// Return true if the SCEV expansion generated by the rewriter can replace the
+/// original value. SCEV guarantees that it produces the same value, but the way
+/// it is produced may be illegal IR. Ideally, this function will only be
+/// called for verification.
+bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
+ // If an SCEV expression subsumed multiple pointers, its expansion could
+ // reassociate the GEP changing the base pointer. This is illegal because the
+ // final address produced by a GEP chain must be inbounds relative to its
+ // underlying object. Otherwise basic alias analysis, among other things,
+ // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+ // producing an expression involving multiple pointers. Until then, we must
+ // bail out here.
+ //
+ // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
+ // because it understands lcssa phis while SCEV does not.
+ Value *FromPtr = FromVal;
+ Value *ToPtr = ToVal;
+ if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) {
+ FromPtr = GEP->getPointerOperand();
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) {
+ ToPtr = GEP->getPointerOperand();
+ }
+ if (FromPtr != FromVal || ToPtr != ToVal) { // At least one side is a GEP.
+ // Quickly check the common case: the pointer operands are identical.
+ if (FromPtr == ToPtr)
+ return true;
+
+ // SCEV may have rewritten an expression that produces the GEP's pointer
+ // operand. That's ok as long as the pointer operand has the same base
+ // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
+ // base of a recurrence. This handles the case in which SCEV expansion
+ // converts a pointer type recurrence into a nonrecurrent pointer base
+ // indexed by an integer recurrence.
+
+ // If the GEP base pointer is a vector of pointers, abort.
+ if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+ return false;
+
+ const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+ const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+ if (FromBase == ToBase)
+ return true;
+
+ DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
+ << *FromBase << " != " << *ToBase << "\n");
+
+ return false;
+ }
+ return true; // Neither value is a GEP: nothing to check.
+}
+
+/// Determine the insertion point for this user. By default, insert immediately
+/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
+/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
+/// common dominator for the incoming blocks.
+static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
+ DominatorTree *DT, LoopInfo *LI) {
+ PHINode *PHI = dyn_cast<PHINode>(User);
+ if (!PHI)
+ return User;
+
+ Instruction *InsertPt = nullptr;
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
+ if (PHI->getIncomingValue(i) != Def) // Only edges that carry Def matter.
+ continue;
+
+ BasicBlock *InsertBB = PHI->getIncomingBlock(i);
+ if (!InsertPt) { // First matching edge: start at its block's terminator.
+ InsertPt = InsertBB->getTerminator();
+ continue;
+ }
+ InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
+ InsertPt = InsertBB->getTerminator();
+ }
+ assert(InsertPt && "Missing phi operand");
+
+ auto *DefI = dyn_cast<Instruction>(Def);
+ if (!DefI) // Def is not an instruction: no dominance/loop constraint here.
+ return InsertPt;
+
+ assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses");
+
+ auto *L = LI->getLoopFor(DefI->getParent());
+ assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent())));
+
+ for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom())
+ if (LI->getLoopFor(DTN->getBlock()) == L) // First dominator directly in DefI's loop.
+ return DTN->getBlock()->getTerminator();
+
+ llvm_unreachable("DefI dominates InsertPt!");
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// Convert APF to a 64-bit integer in IntVal; false unless opOK and exact.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+ bool isExact = false;
+ // See if we can convert this to an int64_t
+ uint64_t UIntVal; // Raw 64-bit result; reinterpreted as int64_t below.
+ if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
+ &isExact) != APFloat::opOK || !isExact)
+ return false;
+ IntVal = UIntVal;
+ return true;
+}
+
+/// If the loop has a floating-point induction variable then insert the
+/// corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+/// bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+/// bar((double)i);
+///
+void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
+ unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); // 1 iff block 0 is inside L.
+ unsigned BackEdge = IncomingEdge^1; // The other operand; assumes PN has two.
+
+ // Check incoming value.
+ auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+ int64_t InitValue;
+ if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+ return;
+
+ // Check IV increment. Reject this PN if increment operation is not
+ // an add or increment value can not be represented by an integer.
+ auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+ if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return;
+
+ // If this is not an add of the PHI with a constantfp, or if the constant fp
+ // is not an integer, bail out.
+ ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+ int64_t IncValue;
+ if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
+ !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+ return;
+
+ // Check Incr uses. One user is PN and the other user is an exit condition
+ // used by the conditional terminator.
+ Value::user_iterator IncrUse = Incr->user_begin();
+ Instruction *U1 = cast<Instruction>(*IncrUse++);
+ if (IncrUse == Incr->user_end()) return; // Only one user.
+ Instruction *U2 = cast<Instruction>(*IncrUse++);
+ if (IncrUse != Incr->user_end()) return; // More than two users.
+
+ // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
+ // only used by a branch, we can't transform it.
+ FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+ if (!Compare)
+ Compare = dyn_cast<FCmpInst>(U2);
+ if (!Compare || !Compare->hasOneUse() ||
+ !isa<BranchInst>(Compare->user_back()))
+ return;
+
+ BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
+
+ // We need to verify that the branch actually controls the iteration count
+ // of the loop. If not, the new IV can overflow and no one will notice.
+ // The branch block must be in the loop and one of the successors must be out
+ // of the loop.
+ assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+ if (!L->contains(TheBr->getParent()) ||
+ (L->contains(TheBr->getSuccessor(0)) &&
+ L->contains(TheBr->getSuccessor(1))))
+ return;
+
+
+ // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+ // transform it.
+ ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+ int64_t ExitValue;
+ if (ExitValueVal == nullptr ||
+ !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+ return;
+
+ // Find new predicate for integer comparison.
+ CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+ switch (Compare->getPredicate()) {
+ default: return; // Unknown comparison.
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+ }
+
+ // We convert the floating point induction variable to a signed i32 value if
+ // we can. This is only safe if the comparison will not overflow in a way
+ // that won't be trapped by the integer equivalent operations. Check for this
+ // now.
+ // TODO: We could use i64 if it is native and the range requires it.
+
+ // The start/stride/exit values must all fit in signed i32.
+ if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+ return;
+
+ // If not actually striding (add x, 0.0), avoid touching the code.
+ if (IncValue == 0)
+ return;
+
+ // Positive and negative strides have different safety conditions.
+ if (IncValue > 0) {
+ // If we have a positive stride, we require the init to be less than the
+ // exit value.
+ if (InitValue >= ExitValue)
+ return;
+
+ uint32_t Range = uint32_t(ExitValue-InitValue);
+ // Check for infinite loop, either:
+ // while (i <= Exit) or until (i > Exit)
+ if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+ if (++Range == 0) return; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+ return;
+
+ } else {
+ // If we have a negative stride, we require the init to be greater than the
+ // exit value.
+ if (InitValue <= ExitValue)
+ return;
+
+ uint32_t Range = uint32_t(InitValue-ExitValue);
+ // Check for infinite loop, either:
+ // while (i >= Exit) or until (i < Exit)
+ if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+ if (++Range == 0) return; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(-IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+ return;
+ }
+
+ IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+ // Insert new integer induction variable.
+ PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+ NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+ PN->getIncomingBlock(IncomingEdge));
+
+ Value *NewAdd =
+ BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+ Incr->getName()+".int", Incr);
+ NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+ ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+ ConstantInt::get(Int32Ty, ExitValue),
+ Compare->getName());
+
+ // In the following deletions, PN may become dead and may be deleted.
+ // Use a WeakVH to observe whether this happens.
+ WeakVH WeakPH = PN;
+
+ // Delete the old floating point exit comparison. The branch starts using the
+ // new comparison.
+ NewCompare->takeName(Compare);
+ Compare->replaceAllUsesWith(NewCompare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
+
+ // Delete the old floating point increment.
+ Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
+
+ // If the FP induction variable still has uses, this is because something else
+ // in the loop uses its value. In order to canonicalize the induction
+ // variable, we chose to eliminate the IV and rewrite it in terms of an
+ // int->fp cast.
+ //
+ // We give preference to sitofp over uitofp because it is faster on most
+ // platforms.
+ if (WeakPH) { // Still non-null, i.e. PN survived the deletions above.
+ Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+ &*PN->getParent()->getFirstInsertionPt());
+ PN->replaceAllUsesWith(Conv);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
+ }
+ Changed = true;
+}
+
+void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
+ // First step. Check to see if there are any floating-point recurrences.
+ // If there are, change them into integer recurrences, permitting analysis by
+ // the SCEV routines.
+ // The header PHIs are collected first and then processed one by one.
+ BasicBlock *Header = L->getHeader();
+
+ SmallVector<WeakVH, 8> PHIs; // Weak handles: a PHI may be deleted meanwhile.
+ for (BasicBlock::iterator I = Header->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHIs.push_back(PN);
+
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) // Skip nulled handles.
+ handleFloatingPointIV(L, PN);
+
+ // If the loop previously had a floating-point IV, ScalarEvolution
+ // may not have been able to compute a trip count. Now that we've done some
+ // re-writing, the trip count may be computable.
+ if (Changed)
+ SE->forgetLoop(L);
+}
+
+namespace {
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+ PHINode *PN; // The exit-block PHI to rewrite.
+ unsigned Ith; // Index of the incoming value to replace.
+ Value *Val; // Exit value after expansion.
+ bool HighCost; // True if the expansion was judged high cost.
+ bool SafePhi; // True if PN itself may be RAUW'd (LCSSASafePhiForRAUW).
+
+ RewritePhi(PHINode *P, unsigned I, Value *V, bool H, bool S)
+ : PN(P), Ith(I), Val(V), HighCost(H), SafePhi(S) {}
+};
+}
+
+Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
+ Loop *L, Instruction *InsertPt,
+ Type *ResultTy) {
+ // Before expanding S into an expensive LLVM expression, see if we can use an
+ // already existing value of type ResultTy as the expansion for S.
+ if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L))
+ if (ExistingValue->getType() == ResultTy)
+ return ExistingValue;
+
+ // We didn't find anything usable, fall back to using SCEVExpander.
+ return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+/// Check to see if this loop has a computable loop-invariant execution count.
+/// If so, this means that we can compute the final value of any expressions
+/// that are recurrent in the loop, and substitute the exit values from the loop
+/// into any instructions outside of the loop that use the final values of the
+/// current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
+void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
+ // Check a pre-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ SmallVector<RewritePhi, 8> RewritePhiSet;
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBB = ExitBlocks[i];
+
+ // If there are no PHI nodes in this exit block, then no values defined
+ // inside the loop are used on this path, skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // We would like to be able to RAUW single-incoming value PHI nodes. We
+ // have to be certain this is safe even when this is an LCSSA PHI node.
+ // While the computed exit value is no longer varying in *this* loop, the
+ // exit block may be an exit block for an outer containing loop as well,
+ // the exit value may be varying in the outer loop, and thus it may still
+ // require an LCSSA PHI node. The safe case is when this is
+ // single-predecessor PHI node (LCSSA) and the exit block containing it is
+ // part of the enclosing loop, or this is the outer most loop of the nest.
+ // In either case the exit value could (at most) be varying in the same
+ // loop body as the phi node itself. Thus if it is in turn used outside of
+ // an enclosing loop it will only be via a separate LCSSA node.
+ bool LCSSASafePhiForRAUW =
+ NumPreds == 1 &&
+ (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB));
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+
+ // SCEV only supports integer expressions for now.
+ if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy())
+ continue;
+
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
+
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // If the value being merged in is not an instruction (and so cannot be
+ // defined in the loop), skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible.
+ const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (!SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+
+ // Computing the value outside of the loop brings no benefit if:
+ // - it is definitely used inside the loop in a way which can not be
+ // optimized away.
+ // - no use outside of the loop can take advantage of hoisting the
+ // computation out of the loop
+ if (ExitValue->getSCEVType()>=scMulExpr) {
+ unsigned NumHardInternalUses = 0;
+ unsigned NumSoftExternalUses = 0;
+ unsigned NumUses = 0;
+ for (auto IB = Inst->user_begin(), IE = Inst->user_end();
+ IB != IE && NumUses <= 6; ++IB) {
+ Instruction *UseInstr = cast<Instruction>(*IB);
+ unsigned Opc = UseInstr->getOpcode();
+ NumUses++;
+ if (L->contains(UseInstr)) {
+ if (Opc == Instruction::Call || Opc == Instruction::Ret)
+ NumHardInternalUses++;
+ } else {
+ if (Opc == Instruction::PHI) {
+ // Do not count the Phi as a use. LCSSA may have inserted
+ // plenty of trivial ones.
+ NumUses--;
+ for (auto PB = UseInstr->user_begin(),
+ PE = UseInstr->user_end();
+ PB != PE && NumUses <= 6; ++PB, ++NumUses) {
+ unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+ if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ continue;
+ }
+ if (Opc != Instruction::Call && Opc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ }
+ if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+ continue;
+ }
+
+ bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
+ Value *ExitVal =
+ expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
+
+ DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
+ << " LoopVal = " << *Inst << "\n");
+
+ if (!isValidRewrite(Inst, ExitVal)) {
+ DeadInsts.push_back(ExitVal); // Unused expansion; queue it for deletion.
+ continue;
+ }
+
+ // Collect all the candidate PHINodes to be rewritten.
+ RewritePhiSet.push_back(
+ RewritePhi(PN, i, ExitVal, HighCost, LCSSASafePhiForRAUW));
+ }
+ }
+ }
+
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ PHINode *PN = Phi.PN;
+ Value *ExitVal = Phi.Val;
+
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ Changed = true;
+ ++NumReplaced;
+ Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+ PN->setIncomingValue(Phi.Ith, ExitVal);
+
+ // If this instruction is dead now, delete it. Don't do it now to avoid
+ // invalidating iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
+
+ // If we determined that this PHI is safe to replace even if an LCSSA
+ // PHI, do so.
+ if (Phi.SafePhi) {
+ PN->replaceAllUsesWith(ExitVal);
+ PN->eraseFromParent();
+ }
+ }
+
+ // The insertion point instruction may have been deleted; clear it out
+ // so that the rewriter doesn't trip over it later.
+ Rewriter.clearInsertPoint();
+}
+
+/// Check whether it is possible to delete the loop after rewriting exit
+/// value. If it is possible, ignore ReplaceExitValue and do rewriting
+/// aggressively.
+///
+/// Returns true only when all of the following hold:
+///  - the loop has a preheader;
+///  - the loop has exactly one exiting block and exactly one unique exit block;
+///  - for every PHI at the top of the exit block, the value incoming from the
+///    exiting block is either scheduled for rewriting in \p RewritePhiSet, is
+///    not an instruction, or is an instruction with loop-invariant operands;
+///  - no instruction in any block of the loop may have side effects.
+bool IndVarSimplify::canLoopBeDeleted(
+    Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  // If there is no preheader, the loop will not be deleted.
+  if (!Preheader)
+    return false;
+
+  // In LoopDeletion pass Loop can be deleted when ExitingBlocks.size() > 1.
+  // We obviate multiple ExitingBlocks case for simplicity.
+  // TODO: If we see testcase with multiple ExitingBlocks can be deleted
+  // after exit value rewriting, we can enhance the logic here.
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  // Require exactly one of each. Testing "!= 1" rather than "> 1" also rejects
+  // a loop with no exits at all, for which indexing element 0 below would read
+  // past the end of an empty vector.
+  if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
+    return false;
+
+  BasicBlock *ExitBlock = ExitBlocks[0];
+  BasicBlock::iterator BI = ExitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+    // If the Incoming value of P is found in RewritePhiSet, we know it
+    // could be rewritten to use a loop invariant value in transformation
+    // phase later. Skip it in the loop invariant check below.
+    bool found = false;
+    for (const RewritePhi &Phi : RewritePhiSet) {
+      unsigned i = Phi.Ith;
+      if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+        found = true;
+        break;
+      }
+    }
+
+    // An incoming instruction that will not be rewritten must itself only
+    // depend on loop-invariant operands.
+    if (!found)
+      if (auto *I = dyn_cast<Instruction>(Incoming))
+        if (!L->hasLoopInvariantOperands(I))
+          return false;
+
+    ++BI;
+  }
+
+  // Any instruction that may have side effects keeps the loop alive.
+  for (auto *BB : L->blocks())
+    if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+      return false;
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Per-IV record of the sign/zero extend users seen so far. It is filled in
+/// by visitIVCast and is the input from which a WidenIV is constructed.
+struct WideIVInfo {
+  /// The narrow induction variable phi this record describes.
+  PHINode *NarrowIV = nullptr;
+
+  /// Widest legal integer type produced by an accepted [sz]ext of the IV, or
+  /// null while no such extend has been recorded.
+  Type *WidestNativeType = nullptr;
+
+  /// Signedness of the first extend that was recorded: true for sext, false
+  /// for zext.
+  bool IsSigned = false;
+};
+} // end anonymous namespace
+
+/// Fold one cast user of an induction variable into \p WI. Only sext and zext
+/// to a legal integer width are considered; the result is used to pick the
+/// final width of the IV before it is actually widened.
+///
+/// \param Cast the cast instruction that uses the IV.
+/// \param WI   accumulated widening information for the IV; updated in place.
+/// \param SE   used to query type sizes and effective SCEV types.
+/// \param TTI  optional cost model; when null no cost check is performed.
+static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
+                        const TargetTransformInfo *TTI) {
+  const auto CastOpc = Cast->getOpcode();
+  const bool IsSigned = CastOpc == Instruction::SExt;
+  // Anything other than sext/zext is of no interest.
+  if (!IsSigned && CastOpc != Instruction::ZExt)
+    return;
+
+  Type *Ty = Cast->getType();
+  uint64_t Width = SE->getTypeSizeInBits(Ty);
+  if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
+    return;
+
+  // Do not widen when arithmetic on the wide type costs more than on the
+  // narrow type. Only ADD is compared because at least an ADD is needed to
+  // step the induction variable; a more thorough cost comparison could be
+  // added if it proves necessary.
+  if (TTI) {
+    Type *NarrowTy = Cast->getOperand(0)->getType();
+    if (TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+        TTI->getArithmeticInstrCost(Instruction::Add, NarrowTy))
+      return;
+  }
+
+  // First acceptable extend: it fixes both the type and the signedness.
+  if (!WI.WidestNativeType) {
+    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+    WI.IsSigned = IsSigned;
+    return;
+  }
+
+  // The IV is extended to match the sign of its first user, arbitrarily, so
+  // extends of the other signedness are ignored.
+  if (WI.IsSigned != IsSigned)
+    return;
+
+  // Same signedness: remember the wider of the two types.
+  if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
+
+namespace {
+
+/// Record a link in the Narrow IV def-use chain along with the WideIV that
+/// computes the same value as the Narrow IV def. This avoids caching Use*
+/// pointers. These records are the entries of WidenIV's NarrowIVUsers worklist.
+struct NarrowIVDefUse {
+ Instruction *NarrowDef = nullptr; // Narrow value whose user is being visited.
+ Instruction *NarrowUse = nullptr; // The instruction that uses NarrowDef.
+ Instruction *WideDef = nullptr; // Wide value paired with NarrowDef.
+
+ // True if the narrow def is never negative. Tracking this information lets
+ // us use a sign extension instead of a zero extension or vice versa, when
+ // profitable and legal.
+ bool NeverNegative = false;
+
+ NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD,
+ bool NeverNegative)
+ : NarrowDef(ND), NarrowUse(NU), WideDef(WD),
+ NeverNegative(NeverNegative) {}
+};
+
+/// The goal of this transform is to remove sign and zero extends without
+/// creating any new induction variables. To do this, it creates a new phi of
+/// the wider type and redirects all users, either removing extends or inserting
+/// truncs whenever we stop propagating the type.
+/// The single public entry point is createWideIV().
+class WidenIV {
+ // Parameters (copied from the WideIVInfo passed to the constructor)
+ PHINode *OrigPhi; // The narrow IV phi (WI.NarrowIV).
+ Type *WideType; // Target type of the widened IV (WI.WidestNativeType).
+ bool IsSigned; // Extend kind to use: true = sext, false = zext (WI.IsSigned).
+
+ // Context
+ LoopInfo *LI;
+ Loop *L; // Loop containing OrigPhi's block; asserted to have it as header.
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+
+ // Result
+ PHINode *WidePhi; // Wide IV phi; null until createWideIV() sets it.
+ Instruction *WideInc; // WidePhi's incoming value from the latch, if a latch exists.
+ const SCEV *WideIncExpr; // SCEV of WideInc, if WideInc was set.
+ SmallVectorImpl<WeakVH> &DeadInsts; // Caller-owned list of instructions queued for deletion.
+
+ SmallPtrSet<Instruction*,16> Widened; // Narrow instructions already pushed on the worklist.
+ SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; // Worklist of def-use edges still to process.
+
+public:
+ WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
+ ScalarEvolution *SEv, DominatorTree *DTree,
+ SmallVectorImpl<WeakVH> &DI) :
+ OrigPhi(WI.NarrowIV),
+ WideType(WI.WidestNativeType),
+ IsSigned(WI.IsSigned),
+ LI(LInfo),
+ L(LI->getLoopFor(OrigPhi->getParent())), // Reads the LI and OrigPhi members initialized above.
+ SE(SEv),
+ DT(DTree),
+ WidePhi(nullptr),
+ WideInc(nullptr),
+ WideIncExpr(nullptr),
+ DeadInsts(DI) {
+ assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+ }
+
+ PHINode *createWideIV(SCEVExpander &Rewriter); // Returns the wide phi, or null if the IV is not widened.
+
+protected:
+ Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned,
+ Instruction *Use);
+
+ Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR);
+ Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR);
+ Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
+
+ const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse);
+
+ const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU);
+
+ const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+ unsigned OpCode) const;
+
+ Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
+
+ bool widenLoopCompare(NarrowIVDefUse DU);
+
+ void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
+};
+} // anonymous namespace
+
+/// Cheap dominator-tree based loop invariance test, valid under the assumption
+/// that \p V is used inside \p L. LoopInfo::isLoopInvariant() seems gratuitous
+/// for this purpose.
+///
+/// \returns true if \p V is not an instruction, or if its defining block
+/// properly dominates the header of \p L.
+static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {
+  if (auto *Def = dyn_cast<Instruction>(V))
+    return DT->properlyDominates(Def->getParent(), L->getHeader());
+
+  // Values that are not instructions are treated as invariant.
+  return true;
+}
+
+/// Create a sign or zero extension of \p NarrowOper to \p WideType.
+///
+/// The extend is placed before \p Use by default. While \p NarrowOper is
+/// invariant in the loop under consideration and that loop has a preheader, the
+/// insertion point is moved to the preheader's terminator and the parent loop
+/// is tried next.
+///
+/// \returns the value produced by the builder's CreateSExt / CreateZExt call.
+Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
+                                 bool IsSigned, Instruction *Use) {
+  // Start from a conservative insertion point (and debug location) at Use.
+  IRBuilder<> Builder(Use);
+
+  // Walk outwards through the loop nest, hoisting as far as possible.
+  const Loop *Cur = LI->getLoopFor(Use->getParent());
+  while (Cur && Cur->getLoopPreheader() &&
+         isLoopInvariant(NarrowOper, Cur, DT)) {
+    Builder.SetInsertPoint(Cur->getLoopPreheader()->getTerminator());
+    Cur = Cur->getParentLoop();
+  }
+
+  if (IsSigned)
+    return Builder.CreateSExt(NarrowOper, WideType);
+  return Builder.CreateZExt(NarrowOper, WideType);
+}
+
+/// Instantiate a wide operation to replace a narrow operation. This only needs
+/// to handle operations that can evaluate to SCEVAddRec. It can safely return
+/// null for any operation we decide not to clone.
+///
+/// Add, Sub, Mul and UDiv are dispatched to cloneArithmeticIVUser; And, Or,
+/// Xor, Shl, LShr and AShr are dispatched to cloneBitwiseIVUser.
+Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU,
+                                  const SCEVAddRecExpr *WideAR) {
+  switch (DU.NarrowUse->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+    return cloneArithmeticIVUser(DU, WideAR);
+
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return cloneBitwiseIVUser(DU);
+
+  default:
+    break;
+  }
+
+  // Every other opcode is left uncloned.
+  return nullptr;
+}
+
+/// Build a wide copy of the binary operator DU.NarrowUse, inserted immediately
+/// before it. Operands equal to DU.NarrowDef are replaced by DU.WideDef; any
+/// other operand is extended with the IV's own signedness.
+///
+/// \returns the newly created wide binary operator.
+Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
+  Instruction *NarrowUse = DU.NarrowUse;
+  Instruction *NarrowDef = DU.NarrowDef;
+  Instruction *WideDef = DU.WideDef;
+
+  DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+
+  // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
+  // about the narrow operand yet so must insert a [sz]ext. It is probably loop
+  // invariant and will be folded or hoisted. If it actually comes from a
+  // widened IV, it should be removed during a future call to widenIVUse.
+  auto WidenOperand = [&](unsigned Idx) -> Value * {
+    Value *Op = NarrowUse->getOperand(Idx);
+    if (Op == NarrowDef)
+      return WideDef;
+    return createExtendInst(Op, WideType, IsSigned, NarrowUse);
+  };
+  Value *LHS = WidenOperand(0);
+  Value *RHS = WidenOperand(1);
+
+  auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+                                        NarrowBO->getName());
+
+  // Place the clone right before the narrow instruction, then carry over its
+  // IR flags.
+  IRBuilder<> Builder(NarrowUse);
+  Builder.Insert(WideBO);
+  WideBO->copyIRFlags(NarrowBO);
+  return WideBO;
+}
+
+Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
+ const SCEVAddRecExpr *WideAR) { // Clones an Add/Mul/UDiv/Sub user as a wide op; returns null if no extension of the other operand reproduces WideAR.
+ Instruction *NarrowUse = DU.NarrowUse;
+ Instruction *NarrowDef = DU.NarrowDef;
+ Instruction *WideDef = DU.WideDef;
+
+ DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+
+ unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; // Operand slot treated as the IV side.
+
+ // We're trying to find X such that
+ //
+ // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X
+ //
+ // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef),
+ // and check using SCEV if any of them are correct.
+
+ // Returns true if extending NonIVNarrowDef according to `SignExt` is a
+ // correct solution to X, i.e. the resulting SCEV is exactly WideAR.
+ auto GuessNonIVOperand = [&](bool SignExt) {
+ const SCEV *WideLHS;
+ const SCEV *WideRHS;
+
+ auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) { // sext or zext of S, chosen by SignExt.
+ if (SignExt)
+ return SE->getSignExtendExpr(S, Ty);
+ return SE->getZeroExtendExpr(S, Ty);
+ };
+
+ if (IVOpIdx == 0) { // IV is the left operand: extend the right one.
+ WideLHS = SE->getSCEV(WideDef);
+ const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1));
+ WideRHS = GetExtend(NarrowRHS, WideType);
+ } else { // IV is the right operand: extend the left one.
+ const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0));
+ WideLHS = GetExtend(NarrowLHS, WideType);
+ WideRHS = SE->getSCEV(WideDef);
+ }
+
+ // WideUse is "WideDef `op.wide` X" as described in the comment.
+ const SCEV *WideUse = nullptr;
+
+ switch (NarrowUse->getOpcode()) {
+ default:
+ llvm_unreachable("No other possibility!"); // cloneIVUser only dispatches the four opcodes below here.
+
+ case Instruction::Add:
+ WideUse = SE->getAddExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::Mul:
+ WideUse = SE->getMulExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::UDiv:
+ WideUse = SE->getUDivExpr(WideLHS, WideRHS);
+ break;
+
+ case Instruction::Sub:
+ WideUse = SE->getMinusSCEV(WideLHS, WideRHS);
+ break;
+ }
+
+ return WideUse == WideAR; // Pointer comparison of the two SCEV expressions.
+ };
+
+ bool SignExtend = IsSigned; // Try the IV's own signedness first.
+ if (!GuessNonIVOperand(SignExtend)) {
+ SignExtend = !SignExtend; // Then try the opposite extension.
+ if (!GuessNonIVOperand(SignExtend))
+ return nullptr; // Neither guess matches WideAR: do not clone.
+ }
+
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(0), WideType,
+ SignExtend, NarrowUse);
+ Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(1), WideType,
+ SignExtend, NarrowUse);
+
+ auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+ auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+ NarrowBO->getName());
+
+ IRBuilder<> Builder(NarrowUse); // Insert the clone immediately before the narrow instruction.
+ Builder.Insert(WideBO);
+ WideBO->copyIRFlags(NarrowBO);
+ return WideBO;
+}
+
+/// Build the SCEV for `LHS op RHS`, where \p OpCode selects the operation.
+/// Only Instruction::Add, Instruction::Sub and Instruction::Mul are supported;
+/// any other opcode is treated as unreachable.
+const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+                                     unsigned OpCode) const {
+  switch (OpCode) {
+  case Instruction::Add:
+    return SE->getAddExpr(LHS, RHS);
+  case Instruction::Sub:
+    return SE->getMinusSCEV(LHS, RHS);
+  case Instruction::Mul:
+    return SE->getMulExpr(LHS, RHS);
+  default:
+    llvm_unreachable("Unsupported opcode.");
+  }
+}
+
+/// No-wrap operations can transfer sign extension of their result to their
+/// operands. Generate the SCEV value for the widened operation without
+/// actually modifying the IR yet. If the expression after extending the
+/// operands is an AddRec for this loop, return it. Otherwise return null.
+const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
+
+ // Handle the common case of add<nsw/nuw>
+ const unsigned OpCode = DU.NarrowUse->getOpcode();
+ // Only Add/Sub/Mul instructions supported yet.
+ if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+ OpCode != Instruction::Mul)
+ return nullptr;
+
+ // One operand (NarrowDef) has already been extended to WideDef. Now determine
+ // if extending the other will lead to a recurrence.
+ const unsigned ExtendOperIdx =
+ DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; // Index of the operand that is NOT NarrowDef.
+ assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+ const SCEV *ExtendOperExpr = nullptr;
+ const OverflowingBinaryOperator *OBO =
+ cast<OverflowingBinaryOperator>(DU.NarrowUse);
+ if (IsSigned && OBO->hasNoSignedWrap()) // Signed IV: nsw permits a sext of the other operand.
+ ExtendOperExpr = SE->getSignExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else if(!IsSigned && OBO->hasNoUnsignedWrap()) // Unsigned IV: nuw permits a zext of the other operand.
+ ExtendOperExpr = SE->getZeroExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else
+ return nullptr; // No matching no-wrap flag, so the extend cannot be transferred.
+
+ // When creating this SCEV expr, don't apply the current operations NSW or NUW
+ // flags. This instruction may be guarded by control flow that the no-wrap
+ // behavior depends on. Non-control-equivalent instructions can be mapped to
+ // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
+ // semantics to those operations.
+ const SCEV *lhs = SE->getSCEV(DU.WideDef);
+ const SCEV *rhs = ExtendOperExpr;
+
+ // Let's swap operands to the initial order for the case of non-commutative
+ // operations, like SUB. See PR21014.
+ if (ExtendOperIdx == 0)
+ std::swap(lhs, rhs);
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
+
+ if (!AddRec || AddRec->getLoop() != L) // Must be a recurrence on this very loop.
+ return nullptr;
+ return AddRec;
+}
+
+/// Is this instruction potentially interesting for further simplification after
+/// widening its type? In other words, can the extend be safely hoisted out of
+/// the loop with SCEV reducing the value to a recurrence on the same loop. If
+/// so, return the sign or zero extended recurrence. Otherwise return null.
+const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) {
+  if (!SE->isSCEVable(NarrowUse->getType()))
+    return nullptr;
+
+  const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
+  const uint64_t NarrowBits = SE->getTypeSizeInBits(NarrowExpr->getType());
+  // NarrowUse implicitly widens its operand, e.g. a gep with a narrow index.
+  // So don't follow this use.
+  if (NarrowBits >= SE->getTypeSizeInBits(WideType))
+    return nullptr;
+
+  // Extend with the signedness chosen for this IV.
+  const SCEV *WideExpr = nullptr;
+  if (IsSigned)
+    WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+  else
+    WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+
+  // Only a recurrence on this loop is of interest.
+  if (const auto *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr))
+    if (AddRec->getLoop() == L)
+      return AddRec;
+  return nullptr;
+}
+
+/// This IV user cannot be widened. Replace this use of the original narrow IV
+/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
+///
+/// The trunc is created at the position returned by getInsertPointForUses for
+/// this def/use pair, and DU.NarrowUse's uses of DU.NarrowDef are redirected
+/// to it.
+static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
+  DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
+        << " for user " << *DU.NarrowUse << "\n");
+
+  auto *InsertPt = getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI);
+  IRBuilder<> Builder(InsertPt);
+
+  Type *NarrowTy = DU.NarrowDef->getType();
+  Value *Trunc = Builder.CreateTrunc(DU.WideDef, NarrowTy);
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+}
+
+/// If the narrow use is a compare instruction, then widen the compare
+/// (and possibly the other operand). The extend operation is hoisted into the
+/// loop preheader as far as possible. Returns true if the compare was widened.
+bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+ if (!Cmp)
+ return false;
+
+ // We can legally widen the comparison in the following two cases:
+ //
+ // - The signedness of the IV extension and comparison match
+ //
+ // - The narrow IV is never negative (and thus its sign extension is equal
+ // to its zero extension). For instance, let's say we're zero extending
+ // %narrow for the following use
+ //
+ // icmp slt i32 %narrow, %val ... (A)
+ //
+ // and %narrow is never negative. Then
+ //
+ // (A) == icmp slt i32 sext(%narrow), sext(%val)
+ // == icmp slt i32 zext(%narrow), sext(%val)
+
+ if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
+ return false;
+
+ Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); // The operand that is not the narrow IV def.
+ unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ assert (CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+ // Widen the compare instruction.
+ IRBuilder<> Builder(
+ getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); // NOTE(review): Builder is not used again in this function.
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+ // Widen the other operand of the compare, if necessary.
+ if (CastWidth < IVWidth) {
+ Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp); // Extend kind follows the compare's signedness.
+ DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+ }
+ return true;
+}
+
+/// Determine whether an individual user of the narrow IV can be widened. If so,
+/// return the wide clone of the user. Returns null when the def-use chain should not be followed past this user.
+Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+
+ // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
+ if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
+ if (LI->getLoopFor(UsePhi->getParent()) != L) {
+ // For LCSSA phis, sink the truncate outside the loop.
+ // After SimplifyCFG most loop exit targets have a single predecessor.
+ // Otherwise fall back to a truncate within the loop.
+ if (UsePhi->getNumOperands() != 1)
+ truncateIVUse(DU, DT, LI);
+ else {
+ PHINode *WidePhi =
+ PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
+ UsePhi); // Local phi; shadows the WidePhi member inside this scope.
+ WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
+ IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt());
+ Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
+ UsePhi->replaceAllUsesWith(Trunc);
+ DeadInsts.emplace_back(UsePhi);
+ DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
+ << " to " << *WidePhi << "\n");
+ }
+ return nullptr;
+ }
+ }
+ // Our raison d'etre! Eliminate sign and zero extension.
+ if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+ Value *NewDef = DU.WideDef;
+ if (DU.NarrowUse->getType() != WideType) {
+ unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
+ unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+ if (CastWidth < IVWidth) {
+ // The cast isn't as wide as the IV, so insert a Trunc.
+ IRBuilder<> Builder(DU.NarrowUse);
+ NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
+ }
+ else {
+ // A wider extend was hidden behind a narrower one. This may induce
+ // another round of IV widening in which the intermediate IV becomes
+ // dead. It should be very rare.
+ DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+ << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+ NewDef = DU.NarrowUse; // Keep the extend itself; it now extends the wide def.
+ }
+ }
+ if (NewDef != DU.NarrowUse) {
+ DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
+ ++NumElimExt;
+ DU.NarrowUse->replaceAllUsesWith(NewDef);
+ DeadInsts.emplace_back(DU.NarrowUse);
+ }
+ // Now that the extend is gone, we want to expose its uses for potential
+ // further simplification. We don't need to directly inform SimplifyIVUsers
+ // of the new users, because their parent IV will be processed later as a
+ // new loop phi. If we preserved IVUsers analysis, we would also want to
+ // push the uses of WideDef here.
+
+ // No further widening is needed. The deceased [sz]ext had done it for us.
+ return nullptr;
+ }
+
+ // Does this user itself evaluate to a recurrence after widening?
+ const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse);
+ if (!WideAddRec)
+ WideAddRec = getExtendedOperandRecurrence(DU); // Second attempt: rely on the user's nsw/nuw flags.
+
+ if (!WideAddRec) {
+ // If use is a loop condition, try to promote the condition instead of
+ // truncating the IV first.
+ if (widenLoopCompare(DU))
+ return nullptr;
+
+ // This user does not evaluate to a recurrence after widening, so don't
+ // follow it. Instead insert a Trunc to kill off the original use,
+ // eventually isolating the original narrow IV so it can be removed.
+ truncateIVUse(DU, DT, LI);
+ return nullptr;
+ }
+ // Assume block terminators cannot evaluate to a recurrence. We can't
+ // insert a Trunc after a terminator if there happens to be a critical edge.
+ assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
+ "SCEV is not expected to evaluate a block terminator");
+
+ // Reuse the IV increment that SCEVExpander created as long as it dominates
+ // NarrowUse.
+ Instruction *WideUse = nullptr;
+ if (WideAddRec == WideIncExpr
+ && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+ WideUse = WideInc;
+ else {
+ WideUse = cloneIVUser(DU, WideAddRec);
+ if (!WideUse)
+ return nullptr;
+ }
+ // Evaluation of WideAddRec ensured that the narrow expression could be
+ // extended outside the loop without overflow. This suggests that the wide use
+ // evaluates to the same expression as the extended narrow use, but doesn't
+ // absolutely guarantee it. Hence the following failsafe check. In rare cases
+ // where it fails, we simply throw away the newly created wide use.
+ if (WideAddRec != SE->getSCEV(WideUse)) {
+ DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+ << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n");
+ DeadInsts.emplace_back(WideUse);
+ return nullptr;
+ }
+
+ // Returning WideUse pushes it on the worklist.
+ return WideUse;
+}
+
+/// Queue every not-yet-visited user of \p NarrowDef on the NarrowIVUsers
+/// worklist, paired with \p WideDef. Each entry also records whether SCEV can
+/// prove that \p NarrowDef is never negative.
+///
+void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
+  // Is NarrowDef >= 0 (signed) known to hold?
+  const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
+  const SCEV *Zero = SE->getConstant(NarrowSCEV->getType(), 0);
+  const bool NeverNegative =
+      SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV, Zero);
+
+  for (User *U : NarrowDef->users()) {
+    auto *NarrowUser = cast<Instruction>(U);
+
+    // Handle data flow merges and bizarre phi cycles: only users that were
+    // not already in the Widened set are queued.
+    if (Widened.insert(NarrowUser).second)
+      NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef,
+                                 NeverNegative);
+  }
+}
+
+/// Process a single induction variable. First use the SCEVExpander to create a
+/// wide induction variable that evaluates to the same recurrence as the
+/// original narrow IV. Then use a worklist to forward traverse the narrow IV's
+/// def-use chain. After widenIVUse has processed all interesting IV users, the
+/// narrow IV will be isolated for removal by DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+/// Returns the wide phi, or null if the narrow phi was not widened.
+PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
+ // Is this phi an induction variable?
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+ if (!AddRec)
+ return nullptr;
+
+ // Widen the induction variable expression.
+ const SCEV *WideIVExpr = IsSigned ?
+ SE->getSignExtendExpr(AddRec, WideType) :
+ SE->getZeroExtendExpr(AddRec, WideType);
+
+ assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+ "Expect the new IV expression to preserve its type");
+
+ // Can the IV be extended outside the loop without overflow?
+ AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr); // From here on AddRec refers to the wide recurrence.
+ if (!AddRec || AddRec->getLoop() != L)
+ return nullptr;
+
+ // An AddRec must have loop-invariant operands. Since this AddRec is
+ // materialized by a loop header phi, the expression cannot have any post-loop
+ // operands, so they must dominate the loop header.
+ assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+ SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader())
+ && "Loop header phi recurrence inputs do not dominate the loop");
+
+ // The rewriter provides a value for the desired IV expression. This may
+ // either find an existing phi or materialize a new one. Either way, we
+ // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+ // of the phi-SCC dominates the loop entry.
+ Instruction *InsertPt = &L->getHeader()->front();
+ WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+ // Remembering the WideIV increment generated by SCEVExpander allows
+ // widenIVUse to reuse it when widening the narrow IV's increment. We don't
+ // employ a general reuse mechanism because the call above is the only call to
+ // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+ if (BasicBlock *LatchBlock = L->getLoopLatch()) { // Without a latch, WideInc and WideIncExpr stay null.
+ WideInc =
+ cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+ WideIncExpr = SE->getSCEV(WideInc);
+ }
+
+ DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+ ++NumWidened;
+
+ // Traverse the def-use chain using a worklist starting at the original IV.
+ assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" );
+
+ Widened.insert(OrigPhi);
+ pushNarrowIVUsers(OrigPhi, WidePhi);
+
+ while (!NarrowIVUsers.empty()) {
+ NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
+
+ // Process a def-use edge. This may replace the use, so don't hold a
+ // use_iterator across it.
+ Instruction *WideUse = widenIVUse(DU, Rewriter);
+
+ // Follow all def-use edges from the previous narrow use.
+ if (WideUse)
+ pushNarrowIVUsers(DU.NarrowUse, WideUse);
+
+ // widenIVUse may have removed the def-use edge.
+ if (DU.NarrowDef->use_empty())
+ DeadInsts.emplace_back(DU.NarrowDef); // Narrow def has no users left: queue it for deletion.
+ }
+ return WidePhi;
+}
+
+//===----------------------------------------------------------------------===//
+// Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// IVVisitor implementation created for a single IV phi. Every cast passed to
+/// visitCast is forwarded to visitIVCast, which accumulates the widening
+/// information for that phi in the public member WI.
+class IndVarSimplifyVisitor : public IVVisitor {
+  ScalarEvolution *SE;
+  const TargetTransformInfo *TTI;
+  PHINode *IVPhi;
+
+public:
+  /// Widening information gathered so far; WI.NarrowIV is set to the IV phi
+  /// at construction.
+  WideIVInfo WI;
+
+  IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+                        const TargetTransformInfo *TTI,
+                        const DominatorTree *DTree)
+      : SE(SCEV), TTI(TTI), IVPhi(IV) {
+    // DT is not declared in this class; presumably it is a member inherited
+    // from IVVisitor.
+    DT = DTree;
+    WI.NarrowIV = IVPhi;
+    if (ReduceLiveIVs)
+      setSplitOverflowIntrinsics();
+  }
+
+  /// Implement the interface used by simplifyUsersOfIV.
+  void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
+};
+} // end anonymous namespace
+
+/// Iteratively perform simplification on a worklist of IV users. Each
+/// successive simplification may push more users which may themselves be
+/// candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+/// Sets the Changed member whenever a simplification or widening takes place.
+void IndVarSimplify::simplifyAndExtend(Loop *L,
+ SCEVExpander &Rewriter,
+ LoopInfo *LI) {
+ SmallVector<WideIVInfo, 8> WideIVs; // Widening candidates collected during the current round.
+
+ SmallVector<PHINode*, 8> LoopPhis; // Header phis still to be simplified.
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ LoopPhis.push_back(cast<PHINode>(I));
+ }
+ // Each round of simplification iterates through the SimplifyIVUsers worklist
+ // for all current phis, then determines whether any IVs can be
+ // widened. Widening adds new phis to LoopPhis, inducing another round of
+ // simplification on the wide IVs.
+ while (!LoopPhis.empty()) {
+ // Evaluate as many IV expressions as possible before widening any IVs. This
+ // forces SCEV to set no-wrap flags before evaluating sign/zero
+ // extension. The first time SCEV attempts to normalize sign/zero extension,
+ // the result becomes final. So for the most predictable results, we delay
+ // evaluation of sign/zero extend evaluation until needed, and avoid running
+ // other SCEV based analysis prior to simplifyAndExtend.
+ do {
+ PHINode *CurrIV = LoopPhis.pop_back_val();
+
+ // Information about sign/zero extensions of CurrIV.
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
+
+ Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor);
+
+ if (Visitor.WI.WidestNativeType) { // Non-null only if visitIVCast accepted at least one extend.
+ WideIVs.push_back(Visitor.WI);
+ }
+ } while(!LoopPhis.empty());
+
+ for (; !WideIVs.empty(); WideIVs.pop_back()) { // Widen the collected candidates, last one first.
+ WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts);
+ if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {
+ Changed = true;
+ LoopPhis.push_back(WidePhi); // The new wide phi gets its own simplification round.
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Return true if this loop's backedge taken count expression can be safely and
+/// cheaply expanded into an instruction sequence that can be used by
+/// linearFunctionTestReplace.
+///
+/// TODO: This fails for pointer-type loop counters with greater than one byte
+/// strides, consequently preventing LFTR from running. For the purpose of LFTR
+/// we could skip this check in the case that the LFTR loop counter (chosen by
+/// FindLoopCounter) is also pointer type. Instead, we could directly convert
+/// the loop test to an inequality test by checking the target data's alignment
+/// of element types (given that the initial pointer value originates from or is
+/// used by ABI constrained operation, as opposed to inttoptr/ptrtoint).
+/// However, we don't yet have a strong motivation for converting loop tests
+/// into inequality tests.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE,
+                                        SCEVExpander &Rewriter) {
+  // The count must be computable and non-zero.
+  const SCEV *BTC = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BTC) || BTC->isZero())
+    return false;
+
+  // A single exiting block is required.
+  BasicBlock *Exiting = L->getExitingBlock();
+  if (!Exiting)
+    return false;
+
+  // Can't rewrite non-branch yet.
+  if (!isa<BranchInst>(Exiting->getTerminator()))
+    return false;
+
+  // Finally, the expansion itself must not be high cost.
+  return !Rewriter.isHighCostExpansion(BTC, L);
+}
+
+/// Return the loop header phi IFF IncV adds a loop invariant value to the phi.
+///
+/// \p IncV must be an add, a sub, or a getelementptr with exactly two
+/// operands. For add/sub the phi may be either operand; for a getelementptr
+/// only operand 0 is considered. Returns null in every other case.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
+  auto *IncI = dyn_cast<Instruction>(IncV);
+  if (!IncI)
+    return nullptr;
+
+  const unsigned Opc = IncI->getOpcode();
+  const bool IsGEP = Opc == Instruction::GetElementPtr;
+  if (IsGEP) {
+    // An IV counter must preserve its type.
+    if (IncI->getNumOperands() != 2)
+      return nullptr;
+  } else if (Opc != Instruction::Add && Opc != Instruction::Sub) {
+    return nullptr;
+  }
+
+  // Yields V as a phi if it is one that lives in the loop header, else null.
+  auto AsHeaderPhi = [L](Value *V) -> PHINode * {
+    auto *P = dyn_cast<PHINode>(V);
+    if (P && P->getParent() == L->getHeader())
+      return P;
+    return nullptr;
+  };
+
+  // Phi in the first operand: the second operand decides the outcome.
+  if (PHINode *Phi = AsHeaderPhi(IncI->getOperand(0)))
+    return isLoopInvariant(IncI->getOperand(1), L, DT) ? Phi : nullptr;
+
+  if (IsGEP)
+    return nullptr;
+
+  // Allow add/sub to be commuted.
+  if (PHINode *Phi = AsHeaderPhi(IncI->getOperand(1)))
+    if (isLoopInvariant(IncI->getOperand(0), L, DT))
+      return Phi;
+
+  return nullptr;
+}
+
+/// Return the compare guarding the loop latch, or null for unrecognized tests.
+///
+/// The loop is expected to have a single exiting block whose terminator is a
+/// branch (both are asserted). Null is returned when the loop has no latch
+/// block or when the branch condition is not an ICmpInst.
+static ICmpInst *getLoopTest(Loop *L) {
+  assert(L->getExitingBlock() && "expected loop exit");
+
+  // Don't bother with LFTR if the loop is not properly simplified.
+  if (!L->getLoopLatch())
+    return nullptr;
+
+  auto *ExitBr = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  assert(ExitBr && "expected exit branch");
+
+  return dyn_cast<ICmpInst>(ExitBr->getCondition());
+}
+
+/// linearFunctionTestReplace policy. Return true unless we can show that the
+/// current exit test is already sufficiently canonical.
+///
+/// The test is left alone only when it is an eq/ne compare of a loop invariant
+/// value against a header phi (or its increment) whose latch value is a simple
+/// counter increment of that same phi.
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
+  // Do LFTR to simplify the exit condition to an ICMP.
+  ICmpInst *ExitCmp = getLoopTest(L);
+  if (!ExitCmp)
+    return true;
+
+  // Do LFTR to simplify the exit ICMP to EQ/NE
+  switch (ExitCmp->getPredicate()) {
+  case ICmpInst::ICMP_NE:
+  case ICmpInst::ICMP_EQ:
+    break;
+  default:
+    return true;
+  }
+
+  // Sort the operands into a candidate IV side and a loop invariant limit.
+  Value *IVSide = ExitCmp->getOperand(0);
+  Value *Limit = ExitCmp->getOperand(1);
+  if (!isLoopInvariant(Limit, L, DT)) {
+    if (!isLoopInvariant(IVSide, L, DT))
+      return true;
+    std::swap(IVSide, Limit);
+  }
+
+  // Look for a simple IV counter: either a phi or an increment of one.
+  PHINode *Counter = dyn_cast<PHINode>(IVSide);
+  if (!Counter)
+    Counter = getLoopPhiForCounter(IVSide, L, DT);
+  if (!Counter)
+    return true;
+
+  // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+  const int LatchIdx = Counter->getBasicBlockIndex(L->getLoopLatch());
+  if (LatchIdx < 0)
+    return true;
+
+  // Do LFTR if the exit condition's IV is *not* a simple counter.
+  Value *Next = Counter->getIncomingValue(LatchIdx);
+  return getLoopPhiForCounter(Next, L, DT) != Counter;
+}
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+///
+/// Visited records operands that have already been queued for inspection so
+/// they are not walked twice; Depth is the current recursion depth.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
+                               unsigned Depth) {
+  // Constants are concrete unless they are undef.
+  if (isa<Constant>(V))
+    return !isa<UndefValue>(V);
+
+  // Conservatively give up once the recursion limit is reached.
+  if (Depth >= 6)
+    return false;
+
+  // Conservatively handle non-constant non-instructions. For example, Arguments
+  // may be undef.
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (!Inst)
+    return false;
+
+  // Load and return values may be undef.
+  const bool MayHideUndef =
+      Inst->mayReadFromMemory() || isa<CallInst>(Inst) || isa<InvokeInst>(Inst);
+  if (MayHideUndef)
+    return false;
+
+  // Optimistically handle other instructions: each operand seen for the first
+  // time must itself be concrete.
+  for (Value *Operand : Inst->operands()) {
+    const bool FirstVisit = Visited.insert(Operand).second;
+    if (FirstVisit && !hasConcreteDefImpl(Operand, Visited, Depth + 1))
+      return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// This is the entry point for the recursive walk in hasConcreteDefImpl().
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+  // Seed the visited set with the root value before starting at depth zero.
+  SmallPtrSet<Value*, 8> Seen;
+  Seen.insert(V);
+  const unsigned StartDepth = 0;
+  return hasConcreteDefImpl(V, Seen, StartDepth);
+}
+
+/// Return true if this IV has no uses other than the (soon to be rewritten)
+/// loop exit test: every user of the phi must be Cond or the phi's latch
+/// increment, and every user of that increment must be Cond or the phi.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+  Value *Inc = Phi->getIncomingValue(Phi->getBasicBlockIndex(LatchBlock));
+
+  // The phi may only feed the exit test and its own increment.
+  for (User *PhiUser : Phi->users()) {
+    if (PhiUser == Cond || PhiUser == Inc)
+      continue;
+    return false;
+  }
+
+  // The increment may only feed the exit test and the phi.
+  for (User *IncUser : Inc->users()) {
+    if (IncUser == Cond || IncUser == Phi)
+      continue;
+    return false;
+  }
+  return true;
+}
+
+/// Find an affine IV in canonical form.
+///
+/// Scans the PHI nodes at the top of the loop header and returns the preferred
+/// unit-stride affine induction variable for the rewritten exit test, or
+/// nullptr when no header PHI qualifies.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
+/// Currently only valid for LFTR. See the comments at the hasConcreteDef call
+/// in the body.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
+static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+ // The current exit condition; uses by it do not count against AlmostDeadIV.
+ Value *Cond =
+ cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+ // Loop over all of the PHI nodes, looking for a simple counter.
+ PHINode *BestPhi = nullptr;
+ const SCEV *BestInit = nullptr;
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!SE->isSCEVable(Phi->getType()))
+ continue;
+
+ // Avoid comparing an integer IV against a pointer Limit.
+ if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+ continue;
+
+ // Only affine add recurrences of this very loop are candidates.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+ if (!AR || AR->getLoop() != L || !AR->isAffine())
+ continue;
+
+ // AR may be a pointer type, while BECount is an integer type.
+ // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+ // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+ if (PhiWidth < BCWidth ||
+ !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth))
+ continue;
+
+ // Require a constant step of exactly one.
+ const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!Step || !Step->isOne())
+ continue;
+
+ // The value coming in from the latch must be a simple increment of Phi.
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+ if (getLoopPhiForCounter(IncV, L, DT) != Phi)
+ continue;
+
+ // Avoid reusing a potentially undef value to compute other values that may
+ // have originally had a concrete definition.
+ if (!hasConcreteDef(Phi)) {
+ // We explicitly allow unknown phis as long as they are already used by
+ // the loop test. In this case we assume that performing LFTR could not
+ // increase the number of undef users.
+ // Note: this ICmpInst *Cond shadows the outer Value *Cond in this scope.
+ // NOTE(review): when getLoopTest returns null the phi is not rejected
+ // here -- confirm that this is intended.
+ if (ICmpInst *Cond = getLoopTest(L)) {
+ if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
+ && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+ continue;
+ }
+ }
+ }
+ const SCEV *Init = AR->getStart();
+
+ // With no candidate yet, or with an almost-dead one, Phi is taken
+ // unconditionally. Otherwise Phi must win the comparisons below.
+ if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+ // Don't force a live loop counter if another IV can be used.
+ if (AlmostDeadIV(Phi, LatchBlock, Cond))
+ continue;
+
+ // Prefer to count-from-zero. This is a more "canonical" counter form. It
+ // also prefers integer to pointer IVs.
+ if (BestInit->isZero() != Init->isZero()) {
+ if (BestInit->isZero())
+ continue;
+ }
+ // If two IVs both count from zero or both count from nonzero then the
+ // narrower is likely a dead phi that has been widened. Use the wider phi
+ // to allow the other to be eliminated.
+ else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ continue;
+ }
+ BestPhi = Phi;
+ BestInit = Init;
+ }
+ return BestPhi;
+}
+
+/// Help linearFunctionTestReplace by generating a value that holds the RHS of
+/// the new loop test.
+///
+/// IndVar must be an affine add recurrence of L (asserted). For a pointer
+/// IndVar with an integer IVCount the limit is built as a GEP in the loop
+/// preheader; in every other case it is expanded from SCEV before the exiting
+/// branch.
+static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
+ SCEVExpander &Rewriter, ScalarEvolution *SE) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+ const SCEV *IVInit = AR->getStart();
+
+ // IVInit may be a pointer while IVCount is an integer when FindLoopCounter
+ // finds a valid pointer IV. Zero extend (or truncate) IVCount in order to
+ // materialize a GEP. Avoid running SCEVExpander on a new pointer value,
+ // instead reusing the existing GEPs whenever possible.
+ if (IndVar->getType()->isPointerTy()
+ && !IVCount->getType()->isPointerTy()) {
+
+ // IVOffset will be the new GEP offset that is interpreted by GEP as a
+ // signed value. IVCount on the other hand represents the loop trip count,
+ // which is an unsigned value. FindLoopCounter only allows induction
+ // variables that have a positive unit stride of one. This means we don't
+ // have to handle the case of negative offsets (yet) and just need to zero
+ // extend IVCount.
+ Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+ const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVOffset, L) &&
+ "Computed iteration count is not loop invariant!");
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI);
+
+ // The GEP base is the IV's incoming value from the preheader.
+ Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+ assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
+ // We could handle pointer IVs other than i8*, but we need to compensate for
+ // gep index scaling. See canExpandBackedgeTakenCount comments.
+ assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
+ cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
+ && "unit stride pointer IV must be i8*");
+
+ // The limit GEP itself is emitted at the end of the preheader.
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
+ }
+ else {
+ // In any other case, convert both IVInit and IVCount to integers before
+ // comparing. This may result in SCEV expansion of pointers, but in practice
+ // SCEV will fold the pointer arithmetic away as such:
+ // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+ //
+ // Valid Cases: (1) both integers is most common; (2) both may be pointers
+ // for simple memset-style loops.
+ //
+ // IVInit integer and IVCount pointer would only occur if a canonical IV
+ // were generated on top of case #2, which is not expected.
+
+ const SCEV *IVLimit = nullptr;
+ // For unit stride, IVLimit = Start + IVCount with 2's complement overflow.
+ // For non-zero Start, compute IVLimit here.
+ if (AR->getStart()->isZero())
+ IVLimit = IVCount;
+ else {
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+ // Note: this IVInit shadows the one declared at function scope; both are
+ // initialized from AR->getStart().
+ const SCEV *IVInit = AR->getStart();
+
+ // For integer IVs, truncate the IV before computing IVInit + BECount.
+ if (SE->getTypeSizeInBits(IVInit->getType())
+ > SE->getTypeSizeInBits(IVCount->getType()))
+ IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+
+ IVLimit = SE->getAddExpr(IVInit, IVCount);
+ }
+ // Expand the code for the iteration count.
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ IRBuilder<> Builder(BI);
+ assert(SE->isLoopInvariant(IVLimit, L) &&
+ "Computed iteration count is not loop invariant!");
+ // Ensure that we generate the same type as IndVar, or a smaller integer
+ // type. In the presence of null pointer values, we have an integer type
+ // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+ Type *LimitTy = IVCount->getType()->isPointerTy() ?
+ IndVar->getType() : IVCount->getType();
+ return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+ }
+}
+
+/// This method rewrites the exit condition of the loop to be a canonical !=
+/// comparison against the incremented loop induction variable. This pass is
+/// able to rewrite the exit tests of any loop where the SCEV analysis can
+/// determine a loop-invariant trip count of the loop, which is actually a much
+/// broader range than just linear tests.
+///
+/// Returns the newly created exit condition. As side effects the exiting
+/// branch is pointed at the new condition, the old condition is queued in
+/// DeadInsts, NumLFTR is incremented and Changed is set.
+Value *IndVarSimplify::
+linearFunctionTestReplace(Loop *L,
+ const SCEV *BackedgeTakenCount,
+ PHINode *IndVar,
+ SCEVExpander &Rewriter) {
+ assert(canExpandBackedgeTakenCount(L, SE, Rewriter) && "precondition");
+
+ // Initialize CmpIndVar and IVCount to their preincremented values.
+ Value *CmpIndVar = IndVar;
+ const SCEV *IVCount = BackedgeTakenCount;
+
+ // If the exiting block is the same as the backedge block, we prefer to
+ // compare against the post-incremented value, otherwise we must compare
+ // against the preincremented value.
+ if (L->getExitingBlock() == L->getLoopLatch()) {
+ // Add one to the "backedge-taken" count to get the trip count.
+ // This addition may overflow, which is valid as long as the comparison is
+ // truncated to BackedgeTakenCount->getType().
+ IVCount = SE->getAddExpr(BackedgeTakenCount,
+ SE->getOne(BackedgeTakenCount->getType()));
+ // The BackedgeTaken expression contains the number of times that the
+ // backedge branches to the loop header. This is one less than the
+ // number of times the loop executes, so use the incremented indvar.
+ CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
+ }
+
+ Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
+ assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy()
+ && "genLoopLimit missed a cast");
+
+ // Insert a new icmp_ne or icmp_eq instruction before the branch.
+ // NE is used when successor 0 stays in the loop, EQ when it leaves it.
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ ICmpInst::Predicate P;
+ if (L->contains(BI->getSuccessor(0)))
+ P = ICmpInst::ICMP_NE;
+ else
+ P = ICmpInst::ICMP_EQ;
+
+ DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t"
+ << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " IVCount:\t" << *IVCount << "\n");
+
+ IRBuilder<> Builder(BI);
+
+ // LFTR can ignore IV overflow and truncate to the width of
+ // BECount. This avoids materializing the add(zext(add)) expression.
+ unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+ unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+ if (CmpIndVarSize > ExitCntSize) {
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ const SCEV *ARStart = AR->getStart();
+ const SCEV *ARStep = AR->getStepRecurrence(*SE);
+ // For constant IVCount, avoid truncation.
+ if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) {
+ const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt();
+ APInt Count = cast<SCEVConstant>(IVCount)->getAPInt();
+ // Note that the post-inc value of BackedgeTakenCount may have overflowed
+ // above such that IVCount is now zero.
+ // In that case rebuild the count in the wider type as (narrow max) + 1.
+ if (IVCount != BackedgeTakenCount && Count == 0) {
+ Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize);
+ ++Count;
+ }
+ else
+ Count = Count.zext(CmpIndVarSize);
+ // Fold start and count into a constant limit of the IV's own width.
+ APInt NewLimit;
+ if (cast<SCEVConstant>(ARStep)->getValue()->isNegative())
+ NewLimit = Start - Count;
+ else
+ NewLimit = Start + Count;
+ ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
+
+ DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n");
+ } else {
+ // Otherwise narrow the IV to the width of the limit.
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+ "lftr.wideiv");
+ }
+ }
+ Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+ Value *OrigCond = BI->getCondition();
+ // It's tempting to use replaceAllUsesWith here to fully replace the old
+ // comparison, but that's not immediately safe, since users of the old
+ // comparison may not be dominated by the new comparison. Instead, just
+ // update the branch to use the new comparison; in the common case this
+ // will make old comparison dead.
+ BI->setCondition(Cond);
+ DeadInsts.push_back(OrigCond);
+
+ ++NumLFTR;
+ Changed = true;
+ return Cond;
+}
+
+//===----------------------------------------------------------------------===//
+// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+///
+/// Does nothing when the loop has no unique exit block or no preheader. The
+/// preheader is walked backwards from its terminator, and each instruction
+/// that is moved becomes the insertion point for the next one, so the moved
+/// instructions keep their relative order in the exit block.
+void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) return;
+
+ Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt();
+ BasicBlock::iterator I(Preheader->getTerminator());
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader.
+ // Stop scanning as soon as a PHI node is reached.
+ if (isa<PHINode>(I))
+ break;
+
+ // Don't move instructions which might have side effects, since the side
+ // effects need to complete before instructions inside the loop. Also don't
+ // move instructions which might read memory, since the loop may modify
+ // memory. Note that it's okay if the instruction might have undefined
+ // behavior: LoopSimplify guarantees that the preheader dominates the exit
+ // block.
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
+
+ // Skip debug info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Skip eh pad instructions.
+ if (I->isEHPad())
+ continue;
+
+ // Don't sink alloca: we never want to sink static alloca's out of the
+ // entry block, and correctly sinking dynamic alloca's requires
+ // checks for stacksave/stackrestore intrinsics.
+ // FIXME: Refactor this check somehow?
+ if (isa<AllocaInst>(I))
+ continue;
+
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UseBB = User->getParent();
+ // A use by a PHI is attributed to the corresponding incoming block.
+ if (PHINode *P = dyn_cast<PHINode>(User)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = &*I;
+ bool Done = false;
+
+ // Step the iterator off ToMove before it is moved out of the preheader.
+ if (I != Preheader->begin()) {
+ // Skip debug info intrinsics.
+ do {
+ --I;
+ } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+ // Only debug intrinsics remained up to the start of the block.
+ if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+ Done = true;
+ } else {
+ Done = true;
+ }
+
+ ToMove->moveBefore(InsertPt);
+ if (Done) break;
+ // Later (earlier-defined) instructions are placed in front of this one.
+ InsertPt = ToMove;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+/// Pass entry point: run the induction variable simplification sub-passes on
+/// loop L and return whether anything changed.
+///
+/// In order: rewrite non-integer IVs, simplify and extend IV users, rewrite
+/// loop exit values, replace congruent IVs, linear function test replacement,
+/// dead instruction cleanup, sinking of unused preheader invariants, and dead
+/// PHI removal. Returns false immediately when the function is skipped or the
+/// loop is not in LoopSimplify form.
+bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipOptnoneFunction(L))
+ return false;
+
+ // If LoopSimplify form is not available, stay out of trouble. Some notes:
+ // - LSR currently only supports LoopSimplify-form loops. Indvars'
+ // canonicalization can be a pessimization without LSR to "clean up"
+ // afterwards.
+ // - We depend on having a preheader; in particular,
+ // Loop::getCanonicalInductionVariable only supports loops with preheaders,
+ // and we're in trouble if we can't find the induction variable even when
+ // we've manually inserted one.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ // Cache the analyses. TLI and TTI are optional and may be left null.
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Reset per-loop state.
+ DeadInsts.clear();
+ Changed = false;
+
+ // If there are any floating-point recurrences, attempt to
+ // transform them to use integer recurrences.
+ rewriteNonIntegerIVs(L);
+
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+ // Create a rewriter object which we'll use to transform the code with.
+ SCEVExpander Rewriter(*SE, DL, "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+ // Eliminate redundant IV users.
+ //
+ // Simplification works best when run before other consumers of SCEV. We
+ // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+ // other expressions involving loop IVs have been evaluated. This helps SCEV
+ // set no-wrap flags before normalizing sign/zero extension.
+ Rewriter.disableCanonicalMode();
+ simplifyAndExtend(L, Rewriter, LI);
+
+ // Check to see if this loop has a computable loop-invariant execution count.
+ // If so, this means that we can compute the final value of any expressions
+ // that are recurrent in the loop, and substitute the exit values from the
+ // loop into any instructions outside of the loop that use the final values of
+ // the current expressions.
+ //
+ if (ReplaceExitValue != NeverRepl &&
+ !isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ rewriteLoopExitValues(L, Rewriter);
+
+ // Eliminate redundant IV cycles.
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+ // If we have a trip count expression, rewrite the loop's exit condition
+ // using it. We can currently only handle loops with a single exit.
+ if (canExpandBackedgeTakenCount(L, SE, Rewriter) && needsLFTR(L, DT)) {
+ PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
+ if (IndVar) {
+ // Check preconditions for proper SCEVExpander operation. SCEV does not
+ // express SCEVExpander's dependencies, such as LoopSimplify. Instead any
+ // pass that uses the SCEVExpander must do it. This does not work well for
+ // loop passes because SCEVExpander makes assumptions about all loops,
+ // while LoopPassManager only forces the current loop to be simplified.
+ //
+ // FIXME: SCEV expansion has no way to bail out, so the caller must
+ // explicitly check any assumptions made by SCEV. Brittle.
+ // LFTR is skipped when the count is an add recurrence over a loop that
+ // has no preheader.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount);
+ if (!AR || AR->getLoop()->getLoopPreheader())
+ (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+ Rewriter);
+ }
+ }
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted in the loop below, causing the AssertingVH in the cache to
+ // trigger.
+ Rewriter.clear();
+
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+
+ // The Rewriter may not be used from this point on.
+
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ sinkUnusedInvariants(L);
+
+ // Clean up dead instructions.
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
+
+ // Check a post-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");
+
+ // Verify that LFTR, and any other change have not interfered with SCEV's
+ // ability to compute trip count.
+#ifndef NDEBUG
+ if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ // Recompute the count from scratch and compare it at the narrower of the
+ // two types.
+ SE->forgetLoop(L);
+ const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+ SE->getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE->getTruncateOrNoop(NewBECount,
+ BackedgeTakenCount->getType());
+ else
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+ NewBECount->getType());
+ assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV");
+ }
+#endif
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 0000000..dea61f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1503 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does that in a way such that the loop running over
+// the middle range provably does not need range checks. As an example, it will
+// convert
+//
+// len = < known positive >
+// for (i = 0; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+// to
+//
+// len = < known positive >
+// limit = smin(n, len)
+// // no first segment
+// for (i = 0; i < limit; i++) {
+// if (0 <= i && i < len) { // this check is fully redundant
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+// for (i = limit; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <array>
+
+using namespace llvm;
+
+// Hidden command-line options for this pass. Their uses are outside this
+// excerpt, so the descriptions below are hedged.
+
+// -irce-loop-size-cutoff (default 64).
+// NOTE(review): presumably a size limit on loops considered -- confirm.
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+ cl::init(64));
+
+// -irce-print-changed-loops (default false).
+// NOTE(review): presumably enables printing of transformed loops -- confirm.
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+ cl::init(false));
+
+// -irce-print-range-checks (default false).
+// NOTE(review): presumably enables printing of recognized range checks --
+// confirm.
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+ cl::init(false));
+
+// -irce-max-exit-prob-reciprocal (default 10).
+// NOTE(review): presumably bounds the loop exit probability as 1/N -- confirm.
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+ cl::Hidden, cl::init(10));
+
+// Debug category name for this file.
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+/// 1. a very cold successor (i.e. the branch jumps to that successor very
+/// rarely)
+///
+/// and
+///
+/// 2. a condition that is provably true for some contiguous range of values
+/// taken by the containing loop's induction variable.
+///
+/// Instances are obtained through create(); the constructor is private and
+/// leaves every field in a well-defined "empty" state (null pointers and
+/// RANGE_CHECK_UNKNOWN).
+class InductiveRangeCheck {
+  // Classifies a range check
+  enum RangeCheckKind : unsigned {
+    // Range check of the form "0 <= I".
+    RANGE_CHECK_LOWER = 1,
+
+    // Range check of the form "I < L" where L is known positive.
+    RANGE_CHECK_UPPER = 2,
+
+    // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER
+    // conditions.
+    RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
+
+    // Unrecognized range check condition.
+    RANGE_CHECK_UNKNOWN = (unsigned)-1
+  };
+
+  // Human readable name of a RangeCheckKind, used by print().
+  static const char *rangeCheckKindToStr(RangeCheckKind);
+
+  const SCEV *Offset;
+  const SCEV *Scale;
+  Value *Length;
+  BranchInst *Branch;
+  RangeCheckKind Kind;
+
+  static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+                                            ScalarEvolution &SE, Value *&Index,
+                                            Value *&Length);
+
+  static InductiveRangeCheck::RangeCheckKind
+  parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition,
+                  const SCEV *&Index, Value *&UpperLimit);
+
+  // Kind is initialized as well so that a freshly constructed object never
+  // carries an indeterminate enum value.
+  InductiveRangeCheck()
+      : Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr),
+        Kind(RANGE_CHECK_UNKNOWN) {}
+
+public:
+  const SCEV *getOffset() const { return Offset; }
+  const SCEV *getScale() const { return Scale; }
+  Value *getLength() const { return Length; }
+
+  /// Write a textual description of this range check to OS. Offset, Scale and
+  /// Branch are dereferenced unconditionally and so must be non-null; Length
+  /// may be null.
+  void print(raw_ostream &OS) const {
+    OS << "InductiveRangeCheck:\n";
+    OS << " Kind: " << rangeCheckKindToStr(Kind) << "\n";
+    OS << " Offset: ";
+    Offset->print(OS);
+    OS << " Scale: ";
+    Scale->print(OS);
+    OS << " Length: ";
+    if (Length)
+      Length->print(OS);
+    else
+      OS << "(null)";
+    OS << "\n Branch: ";
+    getBranch()->print(OS);
+    OS << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print this range check to the debug stream.
+  void dump() const {
+    print(dbgs());
+  }
+#endif
+
+  BranchInst *getBranch() const { return Branch; }
+
+  /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
+  /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+  class Range {
+    const SCEV *Begin;
+    const SCEV *End;
+
+  public:
+    Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+      assert(Begin->getType() == End->getType() && "ill-typed range!");
+    }
+
+    Type *getType() const { return Begin->getType(); }
+    const SCEV *getBegin() const { return Begin; }
+    const SCEV *getEnd() const { return End; }
+  };
+
+  typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
+
+  /// This is the value the condition of the branch needs to evaluate to for the
+  /// branch to take the hot successor (see (1) above).
+  bool getPassingDirection() const { return true; }
+
+  /// Computes a range for the induction variable (IndVar) in which the range
+  /// check is redundant and can be constant-folded away. The induction
+  /// variable is not required to be the canonical {0,+,1} induction variable.
+  Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+                                            const SCEVAddRecExpr *IndVar,
+                                            IRBuilder<> &B) const;
+
+  /// Create an inductive range check out of BI if possible, else return
+  /// nullptr.
+  static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI,
+                                     Loop *L, ScalarEvolution &SE,
+                                     BranchProbabilityInfo &BPI);
+};
+
+/// Loop pass that performs inductive range check elimination; the work is
+/// done in runOnLoop, which is defined out of line.
+class InductiveRangeCheckElimination : public LoopPass {
+ // Allocator for InductiveRangeCheck objects.
+ // NOTE(review): presumably passed to InductiveRangeCheck::create -- confirm.
+ InductiveRangeCheck::AllocatorTy Allocator;
+
+public:
+ static char ID;
+ // Registers the pass with the global pass registry on construction.
+ InductiveRangeCheckElimination() : LoopPass(ID) {
+ initializeInductiveRangeCheckEliminationPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ // Declares the analyses and canonical loop forms this pass requires.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+// Storage for the pass identifier declared in the class.
+char InductiveRangeCheckElimination::ID = 0;
+}
+
+// Pass registration under the name "irce". The listed dependencies are the
+// same five that getAnalysisUsage requires.
+INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+
+/// Map a RangeCheckKind to the spelling of its enumerator. Every enumerator
+/// has a case; reaching the end of the switch is treated as unreachable.
+const char *InductiveRangeCheck::rangeCheckKindToStr(
+    InductiveRangeCheck::RangeCheckKind RCK) {
+  switch (RCK) {
+  case RANGE_CHECK_LOWER:
+    return "RANGE_CHECK_LOWER";
+  case RANGE_CHECK_UPPER:
+    return "RANGE_CHECK_UPPER";
+  case RANGE_CHECK_BOTH:
+    return "RANGE_CHECK_BOTH";
+  case RANGE_CHECK_UNKNOWN:
+    return "RANGE_CHECK_UNKNOWN";
+  }
+
+  llvm_unreachable("unknown range check type!");
+}
+
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI`
+/// cannot be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and
+/// leave `Index` and `Length` unmodified (this function never resets them, so
+/// callers that rely on null values must initialize them beforehand).
+/// Otherwise set `Index` to the value being range checked, and set `Length`
+/// to the upper limit `Index` is being range checked with if (and only if)
+/// the range check type is stronger or equal to RANGE_CHECK_UPPER.
+///
+/// Recognized forms, after canonicalizing sle/slt/ult to sge/sgt/ugt by
+/// swapping the operands: `I sge 0` and `I sgt -1` (lower), `Len sgt I`
+/// (upper) and `Len ugt I` (both), where `Len` is loop invariant and known
+/// non-negative.
+///
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length) {
+
+ // True when V is invariant in L and SCEV can prove it non-negative.
+ auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
+ const SCEV *S = SE.getSCEV(V);
+ if (isa<SCEVCouldNotCompute>(S))
+ return false;
+
+ return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
+ SE.isKnownNonNegative(S);
+ };
+
+ using namespace llvm::PatternMatch;
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+
+ switch (Pred) {
+ default:
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_SGE:
+ // "LHS sge 0" is a lower bound check on LHS.
+ if (match(RHS, m_ConstantInt<0>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_SLT:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_SGT:
+ // "LHS sgt -1" is also a lower bound check on LHS.
+ if (match(RHS, m_ConstantInt<-1>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+
+ // "Len sgt RHS" is an upper bound check on RHS.
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_UPPER;
+ }
+ return RANGE_CHECK_UNKNOWN;
+
+ case ICmpInst::ICMP_ULT:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_UGT:
+ // An unsigned "Len ugt RHS" with non-negative Len is classified as both a
+ // lower and an upper bound check.
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_BOTH;
+ }
+ return RANGE_CHECK_UNKNOWN;
+ }
+
+ llvm_unreachable("default clause returns!");
+}
+
+/// Interpret an arbitrary branch condition as a range check.
+///
+/// Two shapes are handled: a single integer compare, and an `and` of two
+/// integer compares that check the same index value (and, if both carry a
+/// length, the same length). On success `Index` receives the SCEV of the
+/// checked value and the combined check kind is returned; `Length` is set only
+/// when the recognized check is RANGE_CHECK_UPPER or stronger. Returns
+/// RANGE_CHECK_UNKNOWN if the condition cannot be interpreted.
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE,
+                                     Value *Condition, const SCEV *&Index,
+                                     Value *&Length) {
+  using namespace llvm::PatternMatch;
+
+  Value *FirstCond = nullptr;
+  Value *SecondCond = nullptr;
+
+  if (match(Condition, m_And(m_Value(FirstCond), m_Value(SecondCond)))) {
+    ICmpInst *FirstCmp = dyn_cast<ICmpInst>(FirstCond);
+    ICmpInst *SecondCmp = dyn_cast<ICmpInst>(SecondCond);
+    if (!(FirstCmp && SecondCmp))
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    Value *FirstIndex = nullptr;
+    Value *FirstLength = nullptr;
+    Value *SecondIndex = nullptr;
+    Value *SecondLength = nullptr;
+
+    // Both halves are parsed before either result is inspected.
+    auto FirstKind =
+        parseRangeCheckICmp(L, FirstCmp, SE, FirstIndex, FirstLength);
+    auto SecondKind =
+        parseRangeCheckICmp(L, SecondCmp, SE, SecondIndex, SecondLength);
+
+    const bool BothRecognized =
+        FirstKind != InductiveRangeCheck::RANGE_CHECK_UNKNOWN &&
+        SecondKind != InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+    if (!BothRecognized)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    // The two compares must talk about the same index ...
+    if (FirstIndex != SecondIndex)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    // ... and must not disagree about the length.
+    const bool ConflictingLengths =
+        FirstLength != nullptr && SecondLength != nullptr &&
+        FirstLength != SecondLength;
+    if (ConflictingLengths)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    Index = SE.getSCEV(FirstIndex);
+    if (isa<SCEVCouldNotCompute>(Index))
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    Length = FirstLength != nullptr ? FirstLength : SecondLength;
+
+    return static_cast<InductiveRangeCheck::RangeCheckKind>(FirstKind |
+                                                            SecondKind);
+  }
+
+  ICmpInst *OnlyCmp = dyn_cast<ICmpInst>(Condition);
+  if (!OnlyCmp)
+    return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+  Value *IndexVal = nullptr;
+  auto Kind = parseRangeCheckICmp(L, OnlyCmp, SE, IndexVal, Length);
+  if (Kind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+    return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+  Index = SE.getSCEV(IndexVal);
+  if (isa<SCEVCouldNotCompute>(Index))
+    return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+  return Kind;
+}
+
+
+/// Build an InductiveRangeCheck describing the conditional branch `BI`, or
+/// return nullptr if the branch is not a usable range check.
+///
+/// A branch is rejected when it is unconditional, when it sits in the latch of
+/// `L`, when the edge to its first successor has probability below 15/16,
+/// when its condition does not parse as a range check, or when the checked
+/// index is not an affine add recurrence of `L`. The returned object is
+/// placement-constructed in storage obtained from `A`.
+InductiveRangeCheck *
+InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
+                            Loop *L, ScalarEvolution &SE,
+                            BranchProbabilityInfo &BPI) {
+  if (BI->isUnconditional())
+    return nullptr;
+  if (BI->getParent() == L->getLoopLatch())
+    return nullptr;
+
+  // Only consider checks whose successor 0 is overwhelmingly likely.
+  BranchProbability LikelyTaken(15, 16);
+  BranchProbability FirstEdgeProb =
+      BPI.getEdgeProbability(BI->getParent(), static_cast<unsigned>(0));
+  if (FirstEdgeProb < LikelyTaken)
+    return nullptr;
+
+  Value *CheckLength = nullptr;
+  const SCEV *CheckIndex = nullptr;
+  auto Kind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(),
+                                                   CheckIndex, CheckLength);
+  if (Kind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+    return nullptr;
+
+  assert(CheckIndex && "contract with SplitRangeCheckCondition!");
+  assert((!(Kind & InductiveRangeCheck::RANGE_CHECK_UPPER) || CheckLength) &&
+         "contract with SplitRangeCheckCondition!");
+
+  // The index must be an affine recurrence of exactly this loop.
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(CheckIndex);
+  if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
+    return nullptr;
+
+  InductiveRangeCheck *Check = new (A.Allocate()) InductiveRangeCheck;
+  Check->Length = CheckLength;
+  Check->Offset = AddRec->getStart();
+  Check->Scale = AddRec->getStepRecurrence(SE);
+  Check->Branch = BI;
+  Check->Kind = Kind;
+  return Check;
+}
+
+namespace {
+
+// Keeps track of the structure of a loop. This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR. This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+ const char *Tag; // label; parseLoopStructure sets "main", cloneLoop overrides it
+ // The header block and the latch block of the loop.
+ BasicBlock *Header;
+ BasicBlock *Latch;
+
+ // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
+ // successor is `LatchExit', the exit block of the loop.
+ BranchInst *LatchBr;
+ BasicBlock *LatchExit;
+ unsigned LatchBrExitIdx;
+ /* `IndVarNext' is the induction value the latch compare tests, `IndVarStart'
+    the value it has before the first iteration, `LoopExitAt' the limit it is
+    compared against, and `IndVarIncreasing' whether the step is +1. */
+ Value *IndVarNext;
+ Value *IndVarStart;
+ Value *LoopExitAt;
+ bool IndVarIncreasing;
+ // Default state: empty tag, null pointers, LatchBrExitIdx = unsigned(-1).
+ LoopStructure()
+ : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
+ LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
+ IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+ // Copy with each block/value passed through `Map'; other fields copied as-is.
+ template <typename M> LoopStructure map(M Map) const {
+ LoopStructure Result;
+ Result.Tag = Tag;
+ Result.Header = cast<BasicBlock>(Map(Header));
+ Result.Latch = cast<BasicBlock>(Map(Latch));
+ Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarNext = Map(IndVarNext);
+ Result.IndVarStart = Map(IndVarStart);
+ Result.LoopExitAt = Map(LoopExitAt);
+ Result.IndVarIncreasing = IndVarIncreasing;
+ return Result;
+ }
+ // Parses a loop; returns None and sets the reason string on failure.
+ static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+ BranchProbabilityInfo &BPI,
+ Loop &,
+ const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End). The algorithm then tries to break out a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations. The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
+///
+class LoopConstrainer {
+ // The representation of a clone of the original loop we started out with.
+ struct ClonedLoop {
+ // The cloned blocks
+ std::vector<BasicBlock *> Blocks;
+
+ // `Map` maps values in the clonee into values in the cloned version
+ ValueToValueMapTy Map;
+
+ // An instance of `LoopStructure` for the cloned loop
+ LoopStructure Structure;
+ };
+
+ // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
+ // more details on what these fields mean.
+ struct RewrittenRangeInfo {
+ BasicBlock *PseudoExit;
+ BasicBlock *ExitSelector;
+ std::vector<PHINode *> PHIValuesAtPseudoExit;
+ PHINode *IndVarEnd; // PHI in `PseudoExit' holding the induction value there
+
+ RewrittenRangeInfo()
+ : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+ };
+
+ // Calculated subranges we restrict the iteration space of the main loop to.
+ // See the implementation of `calculateSubRanges' for more details on how
+ // these fields are computed. `LowLimit` is None if there is no restriction
+ // on low end of the restricted iteration space of the main loop. `HighLimit`
+ // is None if there is no restriction on high end of the restricted iteration
+ // space of the main loop.
+
+ struct SubRanges {
+ Optional<const SCEV *> LowLimit;
+ Optional<const SCEV *> HighLimit;
+ };
+
+ // A utility function that does a `replaceUsesOfWith' on the incoming block
+ // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+ // incoming block list with `ReplaceBy'.
+ static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy);
+
+ // Compute a safe set of limits for the main loop to run in -- effectively the
+ // intersection of `Range' and the iteration space of the original loop.
+ // Return None if unable to compute the set of subranges.
+ //
+ Optional<SubRanges> calculateSubRanges() const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ //
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ // NOTE(review): no `OriginalHeaderCount' is declared in this class -- stale?
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ //
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ //
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loop's parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+ LoopInfo &OriginalLoopInfo;
+ const SCEV *LatchTakenCount; // null until run() computes the latch exit count
+ BasicBlock *OriginalPreheader; // null until run() looks up the preheader
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range;
+
+ // The structure of the main loop (see comment at the beginning of this class
+ // for a definition)
+ LoopStructure MainLoopStructure;
+
+public:
+ LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS,
+ ScalarEvolution &SE, InductiveRangeCheck::Range R)
+ : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+ SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr),
+ OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R),
+ MainLoopStructure(LS) {}
+
+ // Entry point for the algorithm. Returns true on success.
+ bool run();
+};
+
+}
+
+/// In `PN`, rewrite every incoming-block entry equal to `Block` so that it
+/// names `ReplaceBy` instead. Incoming values are left untouched.
+void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
+                                      BasicBlock *ReplaceBy) {
+  const unsigned NumIncoming = PN->getNumIncomingValues();
+  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+    if (PN->getIncomingBlock(Idx) != Block)
+      continue;
+    PN->setIncomingBlock(Idx, ReplaceBy);
+  }
+}
+
+/// Returns true unless ScalarEvolution's signed or unsigned range for `S`
+/// excludes the maximum signed value of `S`'s integer type.
+static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) {
+  const unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
+  APInt SignedMax = APInt::getSignedMaxValue(BitWidth);
+
+  if (!SE.getSignedRange(S).contains(SignedMax))
+    return false;
+  return SE.getUnsignedRange(S).contains(SignedMax);
+}
+
+/// Returns true unless ScalarEvolution's signed or unsigned range for `S`
+/// excludes the minimum signed value of `S`'s integer type.
+static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
+  const unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
+  APInt SignedMin = APInt::getSignedMinValue(BitWidth);
+
+  if (!SE.getSignedRange(S).contains(SignedMin))
+    return false;
+  return SE.getUnsignedRange(S).contains(SignedMin);
+}
+/* Tries to describe `L' as a LoopStructure: a loop whose latch ends in a
+   conditional exiting branch on a signed integer compare between a unit-step
+   (+1 / -1), non-wrapping add recurrence and a limit. On failure returns None
+   and points `FailureReason' at a short explanation; on success sets
+   `FailureReason' to nullptr. On the success path it may insert instructions
+   before the preheader's last instruction (the adjusted limit and the
+   expanded induction start value). */
+Optional<LoopStructure>
+LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ Loop &L, const char *&FailureReason) {
+ assert(L.isLoopSimplifyForm() && "should follow from addRequired<>");
+ // NOTE(review): the test below is "latch is not exiting", not "no latch".
+ BasicBlock *Latch = L.getLoopLatch();
+ if (!L.isLoopExiting(Latch)) {
+ FailureReason = "no loop latch";
+ return None;
+ }
+
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ FailureReason = "no preheader";
+ return None;
+ }
+
+ BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ FailureReason = "latch terminator not conditional branch";
+ return None;
+ }
+ // Exit successor index: 1 if successor 0 is the header, otherwise 0.
+ unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+ // Reject loops whose exit edge is likelier than 1 / MaxExitProbReciprocal.
+ BranchProbability ExitProbability =
+ BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+
+ if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+ FailureReason = "short running loop, not profitable";
+ return None;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+ if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+ FailureReason = "latch terminator branch not conditional on integral icmp";
+ return None;
+ }
+
+ const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ FailureReason = "could not compute latch count";
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LeftValue = ICI->getOperand(0);
+ const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+ IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+ Value *RightValue = ICI->getOperand(1);
+ const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+ // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ std::swap(LeftValue, RightValue);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else {
+ FailureReason = "no add recurrences in the icmp";
+ return None;
+ }
+ }
+ /* True if `AR' carries the NSW flag, or if sign-extending `AR' to twice its
+    bit width yields an add recurrence whose start and step equal the
+    sign-extended start and step of `AR'. */
+ auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+ if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ return true;
+
+ IntegerType *Ty = cast<IntegerType>(AR->getType());
+ IntegerType *WideTy =
+ IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+ const SCEVAddRecExpr *ExtendAfterOp =
+ dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+ if (ExtendAfterOp) {
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (NoSignedWrap)
+ return true;
+ }
+
+ // We may have proved this when computing the sign extension above.
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+ };
+ // Accepts affine, non-wrapping recurrences with constant step +1 or -1.
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+ if (!AR->isAffine())
+ return false;
+
+ // Currently we only work with induction variables that have been proved to
+ // not wrap. This restriction can potentially be lifted in the future.
+
+ if (!HasNoSignedWrap(AR))
+ return false;
+
+ if (const SCEVConstant *StepExpr =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
+ ConstantInt *StepCI = StepExpr->getValue();
+ if (StepCI->isOne() || StepCI->isMinusOne()) {
+ IsIncreasing = StepCI->isOne();
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // `ICI` is interpreted as taking the backedge if the *next* value of the
+ // induction variable satisfies some constraint.
+
+ const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV);
+ bool IsIncreasing = false;
+ if (!IsInductionVar(IndVarNext, IsIncreasing)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+
+ ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+ // TODO: generalize the predicates here to also match their unsigned variants.
+ if (IsIncreasing) {
+ bool FoundExpectedPred =
+ (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
+ (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp slt semantically, found something else";
+ return None;
+ }
+ // Exiting on `next sgt limit' means looping while `next slt limit + 1'.
+ if (LatchBrExitIdx == 0) {
+ if (CanBeSMax(SE, RightSCEV)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an slt and not an sle.
+ FailureReason = "limit may overflow when coercing sle to slt";
+ return None;
+ }
+
+ IRBuilder<> B(&*Preheader->rbegin());
+ RightValue = B.CreateAdd(RightValue, One);
+ }
+
+ } else {
+ bool FoundExpectedPred =
+ (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
+ (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp sgt semantically, found something else";
+ return None;
+ }
+ // Exiting on `next slt limit' means looping while `next sgt limit - 1'.
+ if (LatchBrExitIdx == 0) {
+ if (CanBeSMin(SE, RightSCEV)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an sgt and not an sge.
+ FailureReason = "limit may overflow when coercing sge to sgt";
+ return None;
+ }
+
+ IRBuilder<> B(&*Preheader->rbegin());
+ RightValue = B.CreateSub(RightValue, One);
+ }
+ }
+ // The latch tests the *next* value, so the start is start(next) - step.
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
+ BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+ assert(SE.getLoopDisposition(LatchCount, &L) ==
+ ScalarEvolution::LoopInvariant &&
+ "loop variant exit count doesn't make sense!");
+
+ assert(!L.contains(LatchExit) && "expected an exit block!");
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ Value *IndVarStartV =
+ SCEVExpander(SE, DL, "irce")
+ .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin());
+ IndVarStartV->setName("indvar.start");
+
+ LoopStructure Result;
+
+ Result.Tag = "main";
+ Result.Header = Header;
+ Result.Latch = Latch;
+ Result.LatchBr = LatchBr;
+ Result.LatchExit = LatchExit;
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarStart = IndVarStartV;
+ Result.IndVarNext = LeftValue;
+ Result.IndVarIncreasing = IsIncreasing;
+ Result.LoopExitAt = RightValue;
+
+ FailureReason = nullptr;
+
+ return Result;
+}
+/* Computes the limits the main loop is restricted to: `Range' clamped to
+   [Smallest, Greatest), the values taken by the induction variable. Returns
+   None when `Range' and the latch exit count have different types. A limit is
+   left unset when ScalarEvolution proves that the corresponding side of
+   `Range' already covers the iteration space. */
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges() const {
+ IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+ if (Range.getType() != Ty)
+ return None;
+
+ LoopConstrainer::SubRanges Result;
+
+ // I think we can be more aggressive here and make this nuw / nsw if the
+ // addition that feeds into the icmp for the latch's terminating branch is nuw
+ // / nsw. In any case, a wrapping 2's complement addition is safe.
+ ConstantInt *One = ConstantInt::get(Ty, 1);
+ const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+ const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
+ // range of values the induction variable takes.
+
+ const SCEV *Smallest = nullptr, *Greatest = nullptr;
+
+ if (Increasing) {
+ Smallest = Start;
+ Greatest = End;
+ } else {
+ // These two computations may sign-overflow. Here is why that is okay:
+ //
+ // We know that the induction variable does not sign-overflow on any
+ // iteration except the last one, and it starts at `Start` and ends at
+ // `End`, decrementing by one every time.
+ //
+ // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+ // induction variable is decreasing we know that the smallest value
+ // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+ //
+ // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
+ // that case, `Clamp` will always return `Smallest` and
+ // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+ // will be an empty range. Returning an empty range is always safe.
+ //
+
+ Smallest = SE.getAddExpr(End, SE.getSCEV(One));
+ Greatest = SE.getAddExpr(Start, SE.getSCEV(One));
+ }
+ // Clamp(S) = smax(Smallest, smin(Greatest, S)).
+ auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
+ return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
+ };
+
+ // In some cases we can prove that we don't need a pre or post loop
+
+ bool ProvablyNoPreloop =
+ SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest);
+ if (!ProvablyNoPreloop)
+ Result.LowLimit = Clamp(Range.getBegin());
+
+ bool ProvablyNoPostLoop =
+ SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+ if (!ProvablyNoPostLoop)
+ Result.HighLimit = Clamp(Range.getEnd());
+
+ return Result;
+}
+/* Clones every block of `OriginalLoop' into `F', naming each clone with the
+   suffix "." + Tag. The clones are recorded in `Result.Blocks' and the
+   original-to-clone mapping in `Result.Map'; `Result.Structure' is the main
+   loop structure mapped onto the clones. PHI nodes in exit blocks receive an
+   additional incoming entry for each cloned exiting block. */
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+ const char *Tag) const {
+ for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+ BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+ Result.Blocks.push_back(Clone);
+ Result.Map[BB] = Clone;
+ }
+ // Maps a value to its entry in Result.Map, or to itself if it has none.
+ auto GetClonedValue = [&Result](Value *V) {
+ assert(V && "null values not in domain!");
+ auto It = Result.Map.find(V);
+ if (It == Result.Map.end())
+ return V;
+ return static_cast<Value *>(It->second);
+ };
+
+ Result.Structure = MainLoopStructure.map(GetClonedValue);
+ Result.Structure.Tag = Tag;
+ // Result.Blocks[i] is the clone of OriginalLoop.getBlocks()[i] (asserted).
+ for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+ BasicBlock *ClonedBB = Result.Blocks[i];
+ BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+ assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+ // Rewrite the clone's instructions through Result.Map.
+ for (Instruction &I : *ClonedBB)
+ RemapInstruction(&I, Result.Map,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+ // Exit blocks will now have one more predecessor and their PHI nodes need
+ // to be edited to reflect that. No phi nodes need to be introduced because
+ // the loop is in LCSSA.
+
+ for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB);
+ SBBI != SBBE; ++SBBI) {
+
+ if (OriginalLoop.contains(*SBBI))
+ continue; // not an exit block
+
+ for (Instruction &I : **SBBI) {
+ if (!isa<PHINode>(&I))
+ break;
+
+ PHINode *PN = cast<PHINode>(&I);
+ Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
+ PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ }
+ }
+ }
+}
+/* Makes the loop `LS' stop iterating once its induction variable reaches
+   `ExitSubloopAt'. Entry from `Preheader' becomes conditional, the latch exit
+   is redirected to a new ".exit.selector" block, and a new ".pseudo.exit"
+   block branches to `ContinuationBlock'. Returns the two new blocks together
+   with the PHI nodes created in the pseudo exit. */
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+ const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+ BasicBlock *ContinuationBlock) const {
+
+ // We start with a loop with a single latch:
+ //
+ //    +--------------------+
+ //    |                    |
+ //    |     preheader      |
+ //    |                    |
+ //    +--------+-----------+
+ //             |      ----------------\
+ //             |     /                |
+ //    +--------v----v------+          |
+ //    |                    |          |
+ //    |      header        |          |
+ //    |                    |          |
+ //    +--------------------+          |
+ //                                    |
+ //            .....                   |
+ //                                    |
+ //    +--------------------+          |
+ //    |                    |          |
+ //    |       latch        >----------/
+ //    |                    |
+ //    +-------v------------+
+ //            |
+ //            |
+ //            |   +--------------------+
+ //            |   |                    |
+ //            +--->   original exit    |
+ //                |                    |
+ //                +--------------------+
+ //
+ // We change the control flow to look like
+ //
+ //
+ //    +--------------------+
+ //    |                    |
+ //    |     preheader      >-------------------------+
+ //    |                    |                         |
+ //    +--------v-----------+                         |
+ //             |    /-------------+                  |
+ //             |   /              |                  |
+ //    +--------v--v--------+      |                  |
+ //    |                    |      |                  |
+ //    |      header        |      |   +--------+     |
+ //    |                    |      |   |        |     |
+ //    +--------------------+      |   |  +-----v-----v-----------+
+ //                                |   |  |                       |
+ //                                |   |  |     .pseudo.exit      |
+ //                                |   |  |                       |
+ //                                |   |  +-----------v-----------+
+ //                                |   |              |
+ //            .....               |   |              |
+ //                                |   |     +--------v-------------+
+ //    +--------------------+      |   |     |                      |
+ //    |                    |      |   |     |   ContinuationBlock  |
+ //    |       latch        >------+   |     |                      |
+ //    |                    |          |     +----------------------+
+ //    +---------v----------+          |
+ //              |                     |
+ //              |                     |
+ //              |     +---------------^-----+
+ //              |     |                     |
+ //              +----->    .exit.selector   |
+ //                    |                     |
+ //                    +----------v----------+
+ //                               |
+ //     +--------------------+    |
+ //     |                    |    |
+ //     |   original exit    <----+
+ //     |                    |
+ //     +--------------------+
+ //
+
+ RewrittenRangeInfo RRI;
+ // Both new blocks are placed right after the latch in the function.
+ auto BBInsertLocation = std::next(Function::iterator(LS.Latch));
+ RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+ &F, &*BBInsertLocation);
+ RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+ &*BBInsertLocation);
+
+ BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin());
+ bool Increasing = LS.IndVarIncreasing;
+
+ IRBuilder<> B(PreheaderJump);
+
+ // EnterLoopCond - is it okay to start executing this `LS'?
+ Value *EnterLoopCond = Increasing
+ ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt);
+ // Replace the preheader's branch with a guarded entry: header or pseudo exit.
+ B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+ PreheaderJump->eraseFromParent();
+ // Send the latch exit to the selector; the latch now tests `ExitSubloopAt'.
+ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+ B.SetInsertPoint(LS.LatchBr);
+ Value *TakeBackedgeLoopCond =
+ Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt)
+ : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt);
+ Value *CondForBranch = LS.LatchBrExitIdx == 1
+ ? TakeBackedgeLoopCond
+ : B.CreateNot(TakeBackedgeLoopCond);
+
+ LS.LatchBr->setCondition(CondForBranch);
+
+ B.SetInsertPoint(RRI.ExitSelector);
+
+ // IterationsLeft - are there any more iterations left, given the original
+ // upper bound on the induction variable? If not, we branch to the "real"
+ // exit.
+ Value *IterationsLeft = Increasing
+ ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt)
+ : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt);
+ B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+ // The pseudo exit only branches on; PHIs below go in front of this branch.
+ BranchInst *BranchToContinuation =
+ BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+ // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+ // each of the PHI nodes in the loop header. This feeds into the initial
+ // value of the same PHI nodes if/when we continue execution.
+ for (Instruction &I : *LS.Header) {
+ if (!isa<PHINode>(&I))
+ break;
+
+ PHINode *PN = cast<PHINode>(&I);
+
+ PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
+ BranchToContinuation);
+
+ NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch),
+ RRI.ExitSelector);
+ RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+ }
+ // `indvar.end' is the induction value on arrival at the pseudo exit.
+ RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end",
+ BranchToContinuation);
+ RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
+ RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector);
+
+ // The latch exit now has a branch from `RRI.ExitSelector' instead of
+ // `LS.Latch'. The PHI nodes need to be updated to reflect that.
+ for (Instruction &I : *LS.LatchExit) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I))
+ replacePHIBlock(PN, LS.Latch, RRI.ExitSelector);
+ else
+ break;
+ }
+
+ return RRI;
+}
+
+/// Make the PHI nodes at the top of `LS.Header` start from the values computed
+/// at the pseudo exit of the preceding loop.
+///
+/// Walking the leading PHI nodes of the header in order, each incoming entry
+/// that comes from `ContinuationBlock` is given the next unused element of
+/// `RRI.PHIValuesAtPseudoExit`. Afterwards `LS.IndVarStart` is set to
+/// `RRI.IndVarEnd`.
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+    LoopStructure &LS, BasicBlock *ContinuationBlock,
+    const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+  unsigned NextPseudoExitPHI = 0;
+
+  for (Instruction &Inst : *LS.Header) {
+    PHINode *HeaderPHI = dyn_cast<PHINode>(&Inst);
+    if (!HeaderPHI)
+      break; // past the PHI nodes at the top of the block
+
+    const unsigned NumIncoming = HeaderPHI->getNumIncomingValues();
+    for (unsigned Idx = 0; Idx < NumIncoming; ++Idx) {
+      if (HeaderPHI->getIncomingBlock(Idx) != ContinuationBlock)
+        continue;
+
+      HeaderPHI->setIncomingValue(
+          Idx, RRI.PHIValuesAtPseudoExit[NextPseudoExitPHI]);
+      ++NextPseudoExitPHI;
+    }
+  }
+
+  LS.IndVarStart = RRI.IndVarEnd;
+}
+
+/// Create a new preheader block for the loop described by `LS`.
+///
+/// A block named `Tag` is created in `F` (passing `LS.Header` as the
+/// insert-before block) and given an unconditional branch to the header.
+/// Every PHI node at the top of the header that names `OldPreheader` as an
+/// incoming block is updated to name the new block instead. Branches that
+/// target the header are not touched here. Returns the new block.
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+                                             BasicBlock *OldPreheader,
+                                             const char *Tag) const {
+  BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+  BranchInst::Create(LS.Header, Preheader);
+
+  for (Instruction &I : *LS.Header) {
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break; // past the PHI nodes at the top of the block
+
+    // replacePHIBlock already scans every incoming entry of `PN`, so one call
+    // per PHI node suffices; calling it once per incoming value only repeated
+    // the same scan.
+    replacePHIBlock(PN, OldPreheader, Preheader);
+  }
+
+  return Preheader;
+}
+
+/// If the original loop is nested inside another loop, register each block in
+/// `BBs` with that parent loop (using `OriginalLoopInfo`). Does nothing for a
+/// top-level loop.
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+  if (Loop *Parent = OriginalLoop.getParentLoop()) {
+    for (BasicBlock *NewBB : BBs)
+      Parent->addBasicBlockToLoop(NewBB, OriginalLoopInfo);
+  }
+}
+/* Drives the transformation: computes the sub-ranges, expands the exit limits
+   before the original preheader's terminator, clones the loop into a pre
+   and/or post loop as needed, rewires the loops, and registers the new blocks
+   with the parent loop. Returns false, before any loop is cloned, if the
+   sub-ranges cannot be computed or a limit may overflow; otherwise true. */
+bool LoopConstrainer::run() {
+ BasicBlock *Preheader = nullptr;
+ LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ Preheader = OriginalLoop.getLoopPreheader();
+ assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ "preconditions!");
+
+ OriginalPreheader = Preheader;
+ MainLoopPreheader = Preheader;
+
+ Optional<SubRanges> MaybeSR = calculateSubRanges();
+ if (!MaybeSR.hasValue()) {
+ DEBUG(dbgs() << "irce: could not compute subranges\n");
+ return false;
+ }
+
+ SubRanges SR = MaybeSR.getValue();
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ IntegerType *IVTy =
+ cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
+
+ SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
+ Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+ // It would have been better to make `PreLoop' and `PostLoop'
+ // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+ // constructor.
+ ClonedLoop PreLoop, PostLoop;
+ bool NeedsPreLoop =
+ Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+ bool NeedsPostLoop =
+ Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+ // For a decreasing induction variable the roles of the two limits swap.
+ Value *ExitPreLoopAt = nullptr;
+ Value *ExitMainLoopAt = nullptr;
+ const SCEVConstant *MinusOneS =
+ cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+ if (NeedsPreLoop) {
+ const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitPreLoopAtSCEV = *SR.LowLimit;
+ else {
+ if (CanBeSMin(SE, *SR.HighLimit)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
+ << "\n");
+ return false;
+ }
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ }
+
+ ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+ ExitPreLoopAt->setName("exit.preloop.at");
+ }
+
+ if (NeedsPostLoop) {
+ const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitMainLoopAtSCEV = *SR.HighLimit;
+ else {
+ if (CanBeSMin(SE, *SR.LowLimit)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
+ << "\n");
+ return false;
+ }
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ }
+
+ ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+ ExitMainLoopAt->setName("exit.mainloop.at");
+ }
+
+ // We clone these ahead of time so that we don't have to deal with changing
+ // and temporarily invalid IR as we transform the loops.
+ if (NeedsPreLoop)
+ cloneLoop(PreLoop, "preloop");
+ if (NeedsPostLoop)
+ cloneLoop(PostLoop, "postloop");
+
+ RewrittenRangeInfo PreLoopRRI;
+ // Route the old preheader into the pre loop; main loop gets a new preheader.
+ if (NeedsPreLoop) {
+ Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+ PreLoop.Structure.Header);
+
+ MainLoopPreheader =
+ createPreheader(MainLoopStructure, Preheader, "mainloop");
+ PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+ ExitPreLoopAt, MainLoopPreheader);
+ rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+ PreLoopRRI);
+ }
+
+ BasicBlock *PostLoopPreheader = nullptr;
+ RewrittenRangeInfo PostLoopRRI;
+
+ if (NeedsPostLoop) {
+ PostLoopPreheader =
+ createPreheader(PostLoop.Structure, Preheader, "postloop");
+ PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+ ExitMainLoopAt, PostLoopPreheader);
+ rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+ PostLoopRRI);
+ }
+ // Gather the blocks created above; entries for unused loops stay nullptr.
+ BasicBlock *NewMainLoopPreheader =
+ MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+ BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
+ PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
+ PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+ // Some of the above may be nullptr, filter them out before passing to
+ // addToParentLoopIfNeeded.
+ auto NewBlocksEnd =
+ std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+ addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+ addToParentLoopIfNeeded(PreLoop.Blocks);
+ addToParentLoopIfNeeded(PostLoop.Blocks);
+
+ return true;
+}
+
+/// Determine the sub-range of induction variable (IndVar) values for which
+/// this range check is known to pass, so that the check can be dropped inside
+/// that range. Yields None when no such range can be computed.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
+                                               const SCEVAddRecExpr *IndVar,
+                                               IRBuilder<> &) const {
+  // Write IndVar as "A + B * I", where "I" is the canonical induction variable
+  // (which need not exist as an actual llvm::Value in the loop). This check
+  // guards the value "C + D * I", with "C" = getOffset() and "D" = getScale().
+  // Re-expressed over IndVar the guarded value is "M + N * IndVar", where
+  // "N" = "D * B^(-1)" and "M" = "C - NA". Only "B" = "D" = { 1 or -1 } is
+  // handled at present; the scheme can be generalized when needed.
+  //
+  // The inequality being solved is therefore
+  //
+  //   0 <= M + 1 * IndVar < L   given L >= 0   (i.e. N == 1)
+  //
+  // and its solution is -M <= IndVar < (L - M) [^1]. Every addition and
+  // subtraction here wraps in twos-complement, and every comparison is signed.
+  //
+  // Proof:
+  //
+  //   Suppose some IndVar satisfies -M <= IndVar < (L - M). Then
+  //   -M <= (-M + L) [== Eq. 1]. Because L >= 0, a sign-overflow in (-M + L)
+  //   would force (-M + L) < (-M), contradicting [Eq. 1]; so (-M + L) did not
+  //   overflow.
+  //
+  //   Consequently IndVar = t + (-M) for some t in [0, L), i.e.
+  //   (IndVar + M) = t, which gives 0 <= (IndVar + M) < L.
+  //
+  // [^1]: The solution is _not_ valid when L < 0; take M = 127, IndVar = 126
+  // and L = -2 in an i8 world as a counterexample.
+
+  if (!IndVar->isAffine())
+    return None;
+
+  // "A" and "B" of the description above.
+  const SCEV *IVStart = IndVar->getStart();
+  const SCEVConstant *IVStep =
+      dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
+  if (IVStep == nullptr)
+    return None;
+
+  // "C" and "D" of the description above.
+  const SCEV *CheckOffset = getOffset();
+  const SCEVConstant *CheckScale = dyn_cast<SCEVConstant>(getScale());
+
+  // Pointer comparison against the (non-null) step; this also rejects a
+  // non-constant scale, because CheckScale is null in that case.
+  if (CheckScale != IVStep)
+    return None;
+
+  // Only a scale of -1 or 1 is supported.
+  ConstantInt *ScaleValue = CheckScale->getValue();
+  if (!ScaleValue->isMinusOne() && !ScaleValue->isOne())
+    return None;
+
+  const SCEV *M = SE.getMinusSCEV(CheckOffset, IVStart);
+  const SCEV *RangeBegin = SE.getNegativeSCEV(M);
+
+  // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+  // We can potentially do much better here.
+  const SCEV *Limit = nullptr;
+  Value *Length = getLength();
+  if (!Length) {
+    // No length operand: this must be a lower-bound-only check, so use the
+    // signed maximum of the induction variable's type as the limit.
+    assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+    unsigned NumBits = cast<IntegerType>(IndVar->getType())->getBitWidth();
+    Limit = SE.getConstant(APInt::getSignedMaxValue(NumBits));
+  } else {
+    Limit = SE.getSCEV(Length);
+  }
+
+  const SCEV *RangeEnd = SE.getMinusSCEV(Limit, M);
+  return InductiveRangeCheck::Range(RangeBegin, RangeEnd);
+}
+
+/// Intersect the (possibly absent) accumulated range R1 with R2.
+///
+/// When R1 holds no value the result is simply R2. When both are present but
+/// their types differ, None is returned. Otherwise the result runs from the
+/// signed maximum of the two begins to the signed minimum of the two ends.
+/// The IRBuilder parameter is not used by this function.
+static Optional<InductiveRangeCheck::Range>
+IntersectRange(ScalarEvolution &SE,
+               const Optional<InductiveRangeCheck::Range> &R1,
+               const InductiveRangeCheck::Range &R2, IRBuilder<> &B) {
+  if (R1.hasValue()) {
+    const InductiveRangeCheck::Range &Accumulated = R1.getValue();
+
+    // TODO: we could widen the smaller range and have this work; but for now
+    // we bail out to keep things simple.
+    if (Accumulated.getType() != R2.getType())
+      return None;
+
+    const SCEV *LowerBound =
+        SE.getSMaxExpr(Accumulated.getBegin(), R2.getBegin());
+    const SCEV *UpperBound = SE.getSMinExpr(Accumulated.getEnd(), R2.getEnd());
+    return InductiveRangeCheck::Range(LowerBound, UpperBound);
+  }
+
+  // Nothing accumulated yet: the new range is the intersection.
+  return R2;
+}
+
+/// Try to eliminate inductive range checks in loop L.
+///
+/// Collects the range checks recognized on the loop's branch terminators,
+/// intersects their safe iteration ranges, asks LoopConstrainer to constrain
+/// the loop to that range and, on success, folds the branch conditions of the
+/// covered checks to constants. Returns true if the loop was changed.
+bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (L->getBlocks().size() >= LoopSizeCutoff) {
+    DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+    return false;
+  }
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (Preheader == nullptr) {
+    DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+    return false;
+  }
+
+  LLVMContext &Ctx = Preheader->getContext();
+  InductiveRangeCheck::AllocatorTy CheckAllocator;
+  SmallVector<InductiveRangeCheck *, 16> FoundChecks;
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  BranchProbabilityInfo &BPI =
+      getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+
+  // Look for a range check on every block that ends in a branch instruction.
+  for (BasicBlock *Block : L->getBlocks()) {
+    BranchInst *Branch = dyn_cast<BranchInst>(Block->getTerminator());
+    if (!Branch)
+      continue;
+
+    InductiveRangeCheck *Check =
+        InductiveRangeCheck::create(CheckAllocator, Branch, L, SE, BPI);
+    if (Check)
+      FoundChecks.push_back(Check);
+  }
+
+  if (FoundChecks.empty())
+    return false;
+
+  auto DumpRecognizedChecks = [&](raw_ostream &OS) {
+    OS << "irce: looking at loop ";
+    L->print(OS);
+    OS << "irce: loop has " << FoundChecks.size()
+       << " inductive range checks: \n";
+    for (InductiveRangeCheck *Check : FoundChecks)
+      Check->print(OS);
+  };
+
+  DEBUG(DumpRecognizedChecks(dbgs()));
+
+  if (PrintRangeChecks)
+    DumpRecognizedChecks(errs());
+
+  const char *FailureReason = nullptr;
+  Optional<LoopStructure> ParsedStructure =
+      LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+  if (!ParsedStructure.hasValue()) {
+    DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
+                 << "\n";);
+    return false;
+  }
+  LoopStructure LS = ParsedStructure.getValue();
+
+  // IndVar is the SCEV of IndVarNext with one step undone: -1 is added for an
+  // increasing induction variable and +1 for a decreasing one.
+  const SCEV *StepBack = SE.getConstant(LS.IndVarNext->getType(),
+                                        LS.IndVarIncreasing ? -1 : 1, true);
+  const SCEVAddRecExpr *IndVar =
+      cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), StepBack));
+
+  Optional<InductiveRangeCheck::Range> SafeIterRange;
+  SmallVector<InductiveRangeCheck *, 4> ChecksToFold;
+
+  Instruction *InsertBefore = Preheader->getTerminator();
+  IRBuilder<> B(InsertBefore);
+
+  // Accumulate the intersection of the safe ranges. A check is recorded for
+  // folding only if its range could be computed and intersected.
+  for (InductiveRangeCheck *Check : FoundChecks) {
+    auto CheckRange = Check->computeSafeIterationSpace(SE, IndVar, B);
+    if (!CheckRange.hasValue())
+      continue;
+
+    auto Intersection =
+        IntersectRange(SE, SafeIterRange, CheckRange.getValue(), B);
+    if (!Intersection.hasValue())
+      continue;
+
+    ChecksToFold.push_back(Check);
+    SafeIterRange = Intersection.getValue();
+  }
+
+  if (!SafeIterRange.hasValue())
+    return false;
+
+  LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS,
+                     SE, SafeIterRange.getValue());
+  if (!LC.run())
+    return false;
+
+  auto DumpConstrainedLoop = [L]() {
+    dbgs() << "irce: in function ";
+    dbgs() << L->getHeader()->getParent()->getName() << ": ";
+    dbgs() << "constrained ";
+    L->print(dbgs());
+  };
+
+  DEBUG(DumpConstrainedLoop());
+
+  if (PrintChangedLoops)
+    DumpConstrainedLoop();
+
+  // Optimize away the now-redundant range checks by replacing each branch
+  // condition with the constant for the check's passing direction.
+  for (InductiveRangeCheck *Check : ChecksToFold) {
+    ConstantInt *Folded = nullptr;
+    if (Check->getPassingDirection())
+      Folded = ConstantInt::getTrue(Ctx);
+    else
+      Folded = ConstantInt::getFalse(Ctx);
+    Check->getBranch()->setCondition(Folded);
+  }
+
+  return true;
+}
+
+/// Public factory: returns a newly allocated InductiveRangeCheckElimination
+/// pass object.
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+  Pass *IRCEPass = new InductiveRangeCheckElimination;
+  return IRCEPass;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 0000000..dcdcfed
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,1955 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <memory>
+using namespace llvm;
+
+// Name under which this file's DEBUG() output and statistics are registered.
+#define DEBUG_TYPE "jump-threading"
+
+// Pass statistics; each string is the description reported for the counter.
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds, "Number of terminators folded");
+STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
+
+// Hidden option, default 6. The JumpThreading constructor uses this value as
+// its duplication threshold when no explicit threshold (-1) is passed in.
+static cl::opt<unsigned>
+BBDuplicateThreshold("jump-threading-threshold",
+ cl::desc("Max block size to duplicate for jump threading"),
+ cl::init(6), cl::Hidden);
+
+// Hidden option, default 3; see the description string for its meaning.
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+ "jump-threading-implication-search-threshold",
+ cl::desc("The number of predecessors to search for a stronger "
+ "condition to use to thread over a weaker condition"),
+ cl::init(3), cl::Hidden);
+
+namespace {
+ // These are at global scope so static functions can use them too.
+ // PredValueInfo is the size-agnostic reference type used for parameters;
+ // PredValueInfoTy is the concrete vector (8 inline elements) used for locals.
+ // Each element pairs a known constant with the predecessor block it holds in.
+ typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo;
+ typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy;
+
+ // This is used to keep track of what kind of constant we're currently hoping
+ // to find.
+ enum ConstantPreference {
+ WantInteger,
+ WantBlockAddress
+ };
+
+ /// This pass performs 'jump threading', which looks at blocks that have
+ /// multiple predecessors and multiple successors. If one or more of the
+ /// predecessors of the block can be proven to always jump to one of the
+ /// successors, we forward the edge from the predecessor to the successor by
+ /// duplicating the contents of this block.
+ ///
+ /// An example of when this can occur is code like this:
+ ///
+ /// if () { ...
+ /// X = 4;
+ /// }
+ /// if (X < 3) {
+ ///
+ /// In this case, the unconditional branch at the end of the first if can be
+ /// revectored to the false side of the second if.
+ ///
+ class JumpThreading : public FunctionPass {
+ // Analyses fetched at the start of runOnFunction.
+ TargetLibraryInfo *TLI;
+ LazyValueInfo *LVI;
+ // Built in runOnFunction only when the function has profile data; reset
+ // at the start of each run and in releaseMemory().
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ // Set in runOnFunction: true when the function has an entry count.
+ bool HasProfileData;
+ // Targets of CFG backedges, filled in by FindLoopHeaders. When NDEBUG is
+ // not defined the elements are AssertingVH handles instead of raw pointers.
+#ifdef NDEBUG
+ SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
+#else
+ SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders;
+#endif
+ // (value, block) pairs currently being visited by
+ // ComputeValueKnownInPredecessors; used to stop cyclic recursion.
+ DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
+
+ // Duplication threshold chosen in the constructor.
+ unsigned BBDupThreshold;
+
+ // RAII helper for updating the recursion stack.
+ // The destructor erases ThePair from TheSet; insertion is done by the
+ // caller before constructing the remover.
+ struct RecursionSetRemover {
+ DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
+ std::pair<Value*, BasicBlock*> ThePair;
+
+ RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S,
+ std::pair<Value*, BasicBlock*> P)
+ : TheSet(S), ThePair(P) { }
+
+ ~RecursionSetRemover() {
+ TheSet.erase(ThePair);
+ }
+ };
+ public:
+ static char ID; // Pass identification
+ // T is the duplication threshold; -1 (the default) selects the value of
+ // the BBDuplicateThreshold command-line option instead.
+ JumpThreading(int T = -1) : FunctionPass(ID) {
+ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+ initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ // Requires LazyValueInfo and TargetLibraryInfo; declares LazyValueInfo and
+ // GlobalsAA as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LazyValueInfo>();
+ AU.addPreserved<LazyValueInfo>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ // Drops the per-function BFI/BPI objects owned by this pass.
+ void releaseMemory() override {
+ BFI.reset();
+ BPI.reset();
+ }
+
+ void FindLoopHeaders(Function &F);
+ bool ProcessBlock(BasicBlock *BB);
+ bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
+ BasicBlock *SuccBB);
+ bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &PredBBs);
+
+ bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
+ PredValueInfo &Result,
+ ConstantPreference Preference,
+ Instruction *CxtI = nullptr);
+ bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+ ConstantPreference Preference,
+ Instruction *CxtI = nullptr);
+
+ bool ProcessBranchOnPHI(PHINode *PN);
+ bool ProcessBranchOnXOR(BinaryOperator *BO);
+ bool ProcessImpliedCondition(BasicBlock *BB);
+
+ bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
+ bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
+ bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);
+
+ private:
+ BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
+ const char *Suffix);
+ void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB,
+ BasicBlock *NewBB, BasicBlock *SuccBB);
+ };
+}
+
+// Definition of the static pass identification member declared in the class.
+char JumpThreading::ID = 0;
+// Register the pass under the name "jump-threading" and declare the two
+// analyses it depends on (the same ones getAnalysisUsage marks as required).
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
+// Threshold is forwarded to the JumpThreading constructor, where -1 selects
+// the command-line default.
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
+
+/// runOnFunction - Top level algorithm.
+///
+/// Removes unreachable blocks, records loop headers, then repeatedly walks
+/// every block of F calling ProcessBlock until a full sweep makes no change.
+/// During each sweep, blocks left without predecessors are deleted and
+/// almost-empty unconditional-branch blocks are folded into their successor.
+/// Returns true if anything in F was changed.
+///
+bool JumpThreading::runOnFunction(Function &F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ LVI = &getAnalysis<LazyValueInfo>();
+ BFI.reset();
+ BPI.reset();
+ // When profile data is available, we need to update edge weights after
+ // successful jump threading, which requires both BPI and BFI being available.
+ HasProfileData = F.getEntryCount().hasValue();
+ if (HasProfileData) {
+ // LI is a local LoopInfo built from a freshly computed dominator tree; it
+ // is only used to construct BPI and BFI here.
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ // Remove unreachable blocks from function as they may result in infinite
+ // loop. We do threading if we found something profitable. Jump threading a
+ // branch can create other opportunities. If these opportunities form a cycle
+ // i.e. if any jump threading is undoing previous threading in the path, then
+ // we will loop forever. We take care of this issue by not jump threading for
+ // back edges. This works for normal cases but not for unreachable blocks as
+ // they may have cycle with no back edge.
+ bool EverChanged = false;
+ EverChanged |= removeUnreachableBlocks(F, LVI);
+
+ FindLoopHeaders(F);
+
+ bool Changed;
+ do {
+ Changed = false;
+ // The iterator is advanced inside the body (not in the for header) so that
+ // it already points past BB before BB may be deleted below.
+ for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+ BasicBlock *BB = &*I;
+ // Thread all of the branches we can over this block.
+ while (ProcessBlock(BB))
+ Changed = true;
+
+ ++I;
+
+ // If the block is trivially dead, zap it. This eliminates the successor
+ // edges which simplifies the CFG.
+ if (pred_empty(BB) &&
+ BB != &BB->getParent()->getEntryBlock()) {
+ DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
+ << "' with terminator: " << *BB->getTerminator() << '\n');
+ // Forget the block in LoopHeaders and LVI before it is destroyed.
+ LoopHeaders.erase(BB);
+ LVI->eraseBlock(BB);
+ DeleteDeadBlock(BB);
+ Changed = true;
+ continue;
+ }
+
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+
+ // Can't thread an unconditional jump, but if the block is "almost
+ // empty", we can replace uses of it with uses of the successor and make
+ // this dead.
+ if (BI && BI->isUnconditional() &&
+ BB != &BB->getParent()->getEntryBlock() &&
+ // If the terminator is the only non-phi instruction, try to nuke it.
+ BB->getFirstNonPHIOrDbg()->isTerminator()) {
+ // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
+ // block, we have to make sure it isn't in the LoopHeaders set. We
+ // reinsert afterward if needed.
+ bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
+ // Remember the successor now; BB may no longer exist after the call below.
+ BasicBlock *Succ = BI->getSuccessor(0);
+
+ // FIXME: It is always conservatively correct to drop the info
+ // for a block even if it doesn't get erased. This isn't totally
+ // awesome, but it allows us to use AssertingVH to prevent nasty
+ // dangling pointer issues within LazyValueInfo.
+ LVI->eraseBlock(BB);
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+ Changed = true;
+ // If we deleted BB and BB was the header of a loop, then the
+ // successor is now the header of the loop.
+ BB = Succ;
+ }
+
+ // BB is either the surviving original block or its successor here.
+ if (ErasedFromLoopHeaders)
+ LoopHeaders.insert(BB);
+ }
+ }
+ EverChanged |= Changed;
+ } while (Changed);
+
+ LoopHeaders.clear();
+ return EverChanged;
+}
+
+/// getJumpThreadDuplicationCost - Estimate how expensive it is to duplicate
+/// BB in order to thread across it. Scanning stops early once the running
+/// cost exceeds the (bonus-adjusted) threshold. A result of ~0U means the
+/// block must not be duplicated at all.
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+                                             unsigned Threshold) {
+  /// PHI nodes are skipped: they get flattened when duplication happens.
+  BasicBlock::const_iterator It(BB->getFirstNonPHI());
+
+  // FIXME: THREADING will delete values that are just used to compute the
+  // branch, so they shouldn't count against the duplication cost.
+
+  // Threading through a switch statement is particularly profitable, so a
+  // block ending in a switch gets its cost reduced to make it more likely to
+  // happen. The same holds for indirect branches, but slightly more so.
+  const TerminatorInst *Term = BB->getTerminator();
+  unsigned Discount = 0;
+  if (isa<IndirectBrInst>(Term))
+    Discount = 8;
+  else if (isa<SwitchInst>(Term))
+    Discount = 6;
+
+  // Raise the threshold by the discount so that the early exit in the loop
+  // cannot bypass the terminator-based adjustment applied at the end.
+  Threshold += Discount;
+
+  // Accumulate the cost of every instruction up to, but excluding, the
+  // terminator: the copy will not contain the terminator.
+  unsigned Cost = 0;
+  for (;; ++It) {
+    const Instruction *Inst = &*It;
+    if (isa<TerminatorInst>(Inst))
+      break;
+
+    // Stop scanning the block if we've reached the threshold.
+    if (Cost > Threshold)
+      return Cost;
+
+    // Debugger intrinsics don't incur code size.
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    // A pointer->pointer bitcast is free.
+    if (isa<BitCastInst>(Inst) && Inst->getType()->isPointerTy())
+      continue;
+
+    // An instruction producing a token that is used outside of this block
+    // cannot be duplicated, so give up.
+    if (Inst->getType()->isTokenTy() && Inst->isUsedOutsideOfBlock(BB))
+      return ~0U;
+
+    // All other instructions count for at least one unit.
+    ++Cost;
+
+    // Calls are more expensive: a non-intrinsic call costs 4 in total, a
+    // non-vector intrinsic costs 2 in total, and a vector intrinsic costs 1.
+    const CallInst *Call = dyn_cast<CallInst>(Inst);
+    if (!Call)
+      continue;
+
+    // Blocks with NoDuplicate (or convergent) calls are modelled as having
+    // infinite cost, so they are never duplicated.
+    if (Call->cannotDuplicate() || Call->isConvergent())
+      return ~0U;
+
+    if (!isa<IntrinsicInst>(Call))
+      Cost += 3;
+    else if (!Call->getType()->isVectorTy())
+      Cost += 1;
+  }
+
+  // Apply the terminator discount, clamping at zero.
+  if (Cost <= Discount)
+    return 0;
+  return Cost - Discount;
+}
+
+/// FindLoopHeaders - Jump threading must not turn proper loop structures into
+/// irreducible loops: that breaks up the loop nesting hierarchy and pessimizes
+/// later transformations. To avoid it we first need the loop headers, which
+/// are approximated here as the targets of backedges in the CFG.
+///
+/// There certainly are cases where threading an edge across a loop header is
+/// desirable. Threading a jump from outside the loop (the preheader) to an
+/// exit block of the loop is definitely profitable; threading backedges from
+/// within the loop to exit blocks is almost always profitable; and threading
+/// backedges to other blocks within the loop (forming a nested loop) is often
+/// profitable. This simple analysis is not rich enough to track all of these
+/// properties and keep them up-to-date as the CFG mutates, so none of these
+/// transformations is allowed.
+///
+void JumpThreading::FindLoopHeaders(Function &F) {
+  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> BackEdges;
+  FindFunctionBackedges(F, BackEdges);
+
+  // The second block of each collected edge is recorded as a loop header.
+  for (unsigned Idx = 0, NumEdges = BackEdges.size(); Idx != NumEdges; ++Idx)
+    LoopHeaders.insert(BackEdges[Idx].second);
+}
+
+/// getKnownConstant - Decide whether Val is a constant we can thread a
+/// terminator over, and if so return it. Which kind of constant qualifies
+/// depends on Preference (an integer or a block address); an undef qualifies
+/// either way. The result is null when Val is null or is not a suitable
+/// constant.
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+  if (Val == nullptr)
+    return nullptr;
+
+  // Undef is "known" enough.
+  if (isa<UndefValue>(Val))
+    return cast<UndefValue>(Val);
+
+  switch (Preference) {
+  case WantBlockAddress:
+    // Pointer casts are stripped before looking for the block address.
+    return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+  case WantInteger:
+    break;
+  }
+
+  return dyn_cast<ConstantInt>(Val);
+}
+
+/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors. If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// \param V          The value to analyze.
+/// \param BB         The block whose predecessors are examined.
+/// \param Result     Receives (constant, predecessor) pairs; entries are
+///                   appended, the vector is not cleared first.
+/// \param Preference Which kind of constant is acceptable.
+/// \param CxtI       Optional context instruction forwarded to the
+///                   LazyValueInfo queries.
+///
+/// This returns true if there were any known values.
+///
+bool JumpThreading::
+ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
+ ConstantPreference Preference,
+ Instruction *CxtI) {
+ // This method walks up use-def chains recursively. Because of this, we could
+ // get into an infinite loop going around loops in the use-def chain. To
+ // prevent this, keep track of what (value, block) pairs we've already visited
+ // and terminate the search if we loop back to them
+ if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+ return false;
+
+ // An RAII helper to remove this pair from the recursion set once the
+ // recursion stack pops back out again.
+ RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
+
+ // If V is a constant, then it is known in all predecessors.
+ if (Constant *KC = getKnownConstant(V, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.push_back(std::make_pair(KC, Pred));
+
+ return true;
+ }
+
+ // If V is a non-instruction value, or an instruction in a different block,
+ // then it can't be derived from a PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getParent() != BB) {
+
+ // Okay, if this is a live-in value, see if it has a known value at the end
+ // of any of our predecessors.
+ //
+ // FIXME: This should be an edge property, not a block end property.
+ /// TODO: Per PR2563, we could infer value range information about a
+ /// predecessor based on its terminator.
+ //
+ // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+ // "I" is a non-local compare-with-a-constant instruction. This would be
+ // able to handle value inequalities better, for example if the compare is
+ // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+ // Perhaps getConstantOnEdge should be smart enough to do this?
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
+ if (Constant *KC = getKnownConstant(PredCst, Preference))
+ Result.push_back(std::make_pair(KC, P));
+ }
+
+ return !Result.empty();
+ }
+
+ /// If I is a PHI node, then we know the incoming values for any constants.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (Constant *KC = getKnownConstant(InVal, Preference)) {
+ Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ } else {
+ // Not a literal constant: ask LVI what the incoming value is on the
+ // edge from that incoming block into BB.
+ Constant *CI = LVI->getConstantOnEdge(InVal,
+ PN->getIncomingBlock(i),
+ BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference))
+ Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ PredValueInfoTy LHSVals, RHSVals;
+
+ // Handle some boolean conditions.
+ if (I->getType()->getPrimitiveSizeInBits() == 1) {
+ assert(Preference == WantInteger && "One-bit non-integer type?");
+ // X | true -> true
+ // X & false -> false
+ if (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And) {
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+ ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
+ WantInteger, CxtI);
+
+ if (LHSVals.empty() && RHSVals.empty())
+ return false;
+
+ // The operand value that alone decides the result: true for Or, false
+ // for And.
+ ConstantInt *InterestingVal;
+ if (I->getOpcode() == Instruction::Or)
+ InterestingVal = ConstantInt::getTrue(I->getContext());
+ else
+ InterestingVal = ConstantInt::getFalse(I->getContext());
+
+ SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+ // Scan for the sentinel. If we find an undef, force it to the
+ // interesting value: x|undef -> true and x&undef -> false.
+ for (const auto &LHSVal : LHSVals)
+ if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
+ Result.emplace_back(InterestingVal, LHSVal.second);
+ LHSKnownBBs.insert(LHSVal.second);
+ }
+ for (const auto &RHSVal : RHSVals)
+ if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
+ // If we already inferred a value for this block on the LHS, don't
+ // re-add it.
+ if (!LHSKnownBBs.count(RHSVal.second))
+ Result.emplace_back(InterestingVal, RHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle the NOT form of XOR.
+ if (I->getOpcode() == Instruction::Xor &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isOne()) {
+ // The operand's known values are written straight into Result and then
+ // inverted in place.
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
+ WantInteger, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Invert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getNot(R.first);
+
+ return true;
+ }
+
+ // Try to simplify some other binary operator values.
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ assert(Preference != WantBlockAddress
+ && "A binary operator creating a block address?");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ // Note: this local shadows the function-level LHSVals declared above.
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+
+ // Try to use constant folding to simplify the binary operator.
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle compare with phi operand, where the PHI is defined in this block.
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ assert(Preference == WantInteger && "Compares only produce integers");
+ PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+ if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ Value *LHS = PN->getIncomingValue(i);
+ Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
+
+ Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL);
+ if (!Res) {
+ // Simplification failed; fall back to LVI, which is only queried when
+ // the right-hand side is a constant.
+ if (!isa<Constant>(RHS))
+ continue;
+
+ LazyValueInfo::Tristate
+ ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+ cast<Constant>(RHS), PredBB, BB,
+ CxtI ? CxtI : Cmp);
+ if (ResT == LazyValueInfo::Unknown)
+ continue;
+ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ }
+
+ if (Constant *KC = getKnownConstant(Res, WantInteger))
+ Result.push_back(std::make_pair(KC, PredBB));
+ }
+
+ return !Result.empty();
+ }
+
+ // If comparing a live-in value against a constant, see if we know the
+ // live-in value on any predecessors.
+ if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
+ if (!isa<Instruction>(Cmp->getOperand(0)) ||
+ cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) {
+ Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ LazyValueInfo::Tristate Res =
+ LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
+ RHSCst, P, BB, CxtI ? CxtI : Cmp);
+ if (Res == LazyValueInfo::Unknown)
+ continue;
+
+ Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+
+ // Try to find a constant value for the LHS of a comparison,
+ // and evaluate it statically if we can.
+ if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
+ // Note: this local shadows the function-level LHSVals declared above.
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
+
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(),
+ V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // Handle select instructions where at least one operand is a known constant
+ // and we can figure out the condition value for any predecessor block.
+ Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+ Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+ PredValueInfoTy Conds;
+ if ((TrueVal || FalseVal) &&
+ ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
+ WantInteger, CxtI)) {
+ for (auto &C : Conds) {
+ Constant *Cond = C.first;
+
+ // Figure out what value to use for the condition.
+ bool KnownCond;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+ // A known boolean.
+ KnownCond = CI->isOne();
+ } else {
+ assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+ // Either operand will do, so be sure to pick the one that's a known
+ // constant.
+ // FIXME: Do this more cleverly if both values are known constants?
+ KnownCond = (TrueVal != nullptr);
+ }
+
+ // See if the select has a known constant value for this predecessor.
+ if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+ Result.push_back(std::make_pair(Val, C.second));
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ // If all else fails, see if LVI can figure out a constant value for us.
+ Constant *CI = LVI->getConstant(V, BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.push_back(std::make_pair(KC, Pred));
+ }
+
+ return !Result.empty();
+}
+
+
+
+ /// GetBestDestForJumpOnUndef - The terminator of the given block jumps on an
+ /// undefined value, so any of its successors is an acceptable target.  Decide
+ /// which successor is best to revector to.
+ ///
+ /// The successor with the fewest predecessors is chosen (the lowest successor
+ /// index wins ties).  This should reduce the in-degree of the others.
+ ///
+ /// Returns the index of the chosen successor within BB's terminator.
+ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
+   TerminatorInst *Term = BB->getTerminator();
+
+   // Seed the search with successor 0, then look for a strictly better one.
+   unsigned BestIdx = 0;
+   BasicBlock *Candidate = Term->getSuccessor(BestIdx);
+   unsigned BestPredCount =
+       std::distance(pred_begin(Candidate), pred_end(Candidate));
+
+   const unsigned NumSuccs = Term->getNumSuccessors();
+   for (unsigned Idx = 1; Idx != NumSuccs; ++Idx) {
+     Candidate = Term->getSuccessor(Idx);
+     unsigned PredCount =
+         std::distance(pred_begin(Candidate), pred_end(Candidate));
+     if (PredCount >= BestPredCount)
+       continue;
+
+     BestIdx = Idx;
+     BestPredCount = PredCount;
+   }
+
+   return BestIdx;
+ }
+
+ /// Return true if BB has its address taken and the corresponding BlockAddress
+ /// still has uses once dead constant users have been removed from it.
+ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+   if (BB->hasAddressTaken()) {
+     // The only thing referring to the block address may be a tree of dead
+     // constants hanging off of it.  Those shouldn't keep the block alive, so
+     // prune them before looking at the remaining uses.
+     BlockAddress *Addr = BlockAddress::get(BB);
+     Addr->removeDeadConstantUsers();
+     return !Addr->use_empty();
+   }
+
+   return false;
+ }
+
+ /// ProcessBlock - If there are any predecessors whose control can be threaded
+ /// through to a successor, transform them now.
+ ///
+ /// Besides threading proper, this performs a series of local cleanups on BB:
+ /// merging it into a lone predecessor, folding a terminator whose condition is
+ /// constant or undef, resolving a compare-against-constant via LVI, load PRE
+ /// on the branch condition, and the PHI / XOR / implied-condition special
+ /// cases.  Returns true as soon as one of these steps reports a change, and
+ /// false if nothing was done.
+ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+   // If the block is trivially dead, just return and let the caller nuke it.
+   // This simplifies other transformations.
+   if (pred_empty(BB) &&
+       BB != &BB->getParent()->getEntryBlock())
+     return false;
+
+   // If this block has a single predecessor, and if that pred has a single
+   // successor, merge the blocks.  This encourages recursive jump threading
+   // because now the condition in this block can be threaded through
+   // predecessors of our predecessor block.
+   if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
+     const TerminatorInst *TI = SinglePred->getTerminator();
+     if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
+         SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
+       // If SinglePred was a loop header, BB becomes one.
+       if (LoopHeaders.erase(SinglePred))
+         LoopHeaders.insert(BB);
+
+       LVI->eraseBlock(SinglePred);
+       MergeBasicBlockIntoOnlyPred(BB);
+
+       return true;
+     }
+   }
+
+   if (TryToUnfoldSelectInCurrBB(BB))
+     return true;
+
+   // What kind of constant we're looking for.
+   ConstantPreference Preference = WantInteger;
+
+   // Look to see if the terminator is a conditional branch, switch or indirect
+   // branch, if not we can't thread it.
+   Value *Condition;
+   Instruction *Terminator = BB->getTerminator();
+   if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+     // Can't thread an unconditional jump.
+     if (BI->isUnconditional()) return false;
+     Condition = BI->getCondition();
+   } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+     Condition = SI->getCondition();
+   } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+     // Can't thread indirect branch with no successors.
+     if (IB->getNumSuccessors() == 0) return false;
+     Condition = IB->getAddress()->stripPointerCasts();
+     Preference = WantBlockAddress;
+   } else {
+     return false; // Must be an invoke.
+   }
+
+   // Run constant folding to see if we can reduce the condition to a simple
+   // constant.
+   if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+     Value *SimpleVal =
+         ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
+     if (SimpleVal) {
+       I->replaceAllUsesWith(SimpleVal);
+       // Folding only proves that the *result* is a constant.  The instruction
+       // itself may still have side effects (e.g. a call that happens to be
+       // foldable), so it may only be deleted when it is trivially dead.
+       if (isInstructionTriviallyDead(I, TLI))
+         I->eraseFromParent();
+       Condition = SimpleVal;
+     }
+   }
+
+   // If the terminator is branching on an undef, we can pick any of the
+   // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
+   if (isa<UndefValue>(Condition)) {
+     unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+
+     // Fold the branch/switch.
+     TerminatorInst *BBTerm = BB->getTerminator();
+     for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+       if (i == BestSucc) continue;
+       BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+     }
+
+     DEBUG(dbgs() << " In block '" << BB->getName()
+           << "' folding undef terminator: " << *BBTerm << '\n');
+     BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+     BBTerm->eraseFromParent();
+     return true;
+   }
+
+   // If the terminator of this block is branching on a constant, simplify the
+   // terminator to an unconditional branch.  This can occur due to threading in
+   // other blocks.
+   if (getKnownConstant(Condition, Preference)) {
+     DEBUG(dbgs() << " In block '" << BB->getName()
+           << "' folding terminator: " << *BB->getTerminator() << '\n');
+     ++NumFolds;
+     ConstantFoldTerminator(BB, true);
+     return true;
+   }
+
+   Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+   // All the rest of our checks depend on the condition being an instruction.
+   if (!CondInst) {
+     // FIXME: Unify this with code below.
+     if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
+       return true;
+     return false;
+   }
+
+
+   if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+     // If we're branching on a conditional, LVI might be able to determine
+     // its value at the branch instruction.  We only handle comparisons
+     // against a constant at this time.
+     // TODO: This should be extended to handle switches as well.
+     BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+     Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+     if (CondBr && CondConst && CondBr->isConditional()) {
+       LazyValueInfo::Tristate Ret =
+         LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+                             CondConst, CondBr);
+       if (Ret != LazyValueInfo::Unknown) {
+         // Successor 0 is taken when the predicate is true, successor 1 when
+         // it is false; drop the edge that can no longer be taken.
+         unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+         unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+         CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+         BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+         CondBr->eraseFromParent();
+         if (CondCmp->use_empty())
+           CondCmp->eraseFromParent();
+         else if (CondCmp->getParent() == BB) {
+           // If the fact we just learned is true for all uses of the
+           // condition, replace it with a constant value
+           auto *CI = Ret == LazyValueInfo::True ?
+             ConstantInt::getTrue(CondCmp->getType()) :
+             ConstantInt::getFalse(CondCmp->getType());
+           CondCmp->replaceAllUsesWith(CI);
+           CondCmp->eraseFromParent();
+         }
+         return true;
+       }
+     }
+
+     if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
+       return true;
+   }
+
+   // Check for some cases that are worth simplifying.  Right now we want to look
+   // for loads that are used by a switch or by the condition for the branch.  If
+   // we see one, check to see if it's partially redundant.  If so, insert a PHI
+   // which can then be used to thread the values.
+   //
+   Value *SimplifyValue = CondInst;
+   if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+     if (isa<Constant>(CondCmp->getOperand(1)))
+       SimplifyValue = CondCmp->getOperand(0);
+
+   // TODO: There are other places where load PRE would be profitable, such as
+   // more complex comparisons.
+   if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+     if (SimplifyPartiallyRedundantLoad(LI))
+       return true;
+
+
+   // Handle a variety of cases where we are branching on something derived from
+   // a PHI node in the current block.  If we can prove that any predecessors
+   // compute a predictable value based on a PHI node, thread those predecessors.
+   //
+   if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
+     return true;
+
+   // If this is an otherwise-unfoldable branch on a phi node in the current
+   // block, see if we can simplify.
+   if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+     if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+       return ProcessBranchOnPHI(PN);
+
+
+   // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
+   if (CondInst->getOpcode() == Instruction::Xor &&
+       CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+     return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+
+   // Search for a stronger dominating condition that can be used to simplify a
+   // conditional branch leaving BB.
+   if (ProcessImpliedCondition(BB))
+     return true;
+
+   return false;
+ }
+
+ /// ProcessImpliedCondition - Walk up the chain of single predecessors of BB
+ /// looking for a conditional branch whose condition implies the condition BB
+ /// itself branches on.  The walk only continues while each block is reached
+ /// through successor 0 of its predecessor's conditional branch, and it is
+ /// bounded by ImplicationSearchThreshold.  On success BB's conditional branch
+ /// is replaced by an unconditional branch to its successor 0 and true is
+ /// returned; otherwise nothing is changed and false is returned.
+ bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
+   auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+   if (!BI)
+     return false;
+   if (!BI->isConditional())
+     return false;
+
+   Value *Cond = BI->getCondition();
+   auto &DL = BB->getModule()->getDataLayout();
+
+   // 'Lower' is the block we are currently standing in, 'Upper' is its unique
+   // predecessor (if any).
+   BasicBlock *Lower = BB;
+   BasicBlock *Upper = BB->getSinglePredecessor();
+
+   for (unsigned Step = 0; Upper && Step < ImplicationSearchThreshold;
+        ++Step) {
+     auto *PBI = dyn_cast<BranchInst>(Upper->getTerminator());
+
+     // Give up unless Lower is entered via successor 0 of a conditional branch.
+     bool EnteredViaFirstSucc =
+         PBI && PBI->isConditional() && PBI->getSuccessor(0) == Lower;
+     if (!EnteredViaFirstSucc)
+       return false;
+
+     if (isImpliedCondition(PBI->getCondition(), Cond, DL)) {
+       // BB's condition is implied, so its successor 1 is never taken from BB.
+       BI->getSuccessor(1)->removePredecessor(BB);
+       BranchInst::Create(BI->getSuccessor(0), BI);
+       BI->eraseFromParent();
+       return true;
+     }
+
+     Lower = Upper;
+     Upper = Lower->getSinglePredecessor();
+   }
+
+   return false;
+ }
+
+ /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+ /// load instruction, eliminate it by replacing it with a PHI node. This is an
+ /// important optimization that encourages jump threading, and needs to be run
+ /// interlaced with other jump threading tasks.
+ ///
+ /// The load is either replaced by a value already available earlier in its own
+ /// block, or by a PHI of the values available in the predecessors (inserting
+ /// one reload on the path(s) where the value is not available). Returns true
+ /// if LI was replaced and erased, false if it was left alone.
+ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+ // Don't hack volatile/atomic loads.
+ if (!LI->isSimple()) return false;
+
+ // If the load is defined in a block with exactly one predecessor, it can't be
+ // partially redundant.
+ BasicBlock *LoadBB = LI->getParent();
+ if (LoadBB->getSinglePredecessor())
+ return false;
+
+ // If the load is defined in an EH pad, it can't be partially redundant,
+ // because the edges between the invoke and the EH pad cannot have other
+ // instructions between them.
+ if (LoadBB->isEHPad())
+ return false;
+
+ Value *LoadedPtr = LI->getOperand(0);
+
+ // If the loaded operand is defined in the LoadBB, it can't be available.
+ // TODO: Could do simple PHI translation, that would be fun :)
+ if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+ if (PtrOp->getParent() == LoadBB)
+ return false;
+
+ // Scan a few instructions up from the load, to see if it is obviously live at
+ // the entry to its block.
+ BasicBlock::iterator BBIt(LI);
+
+ if (Value *AvailableVal =
+ FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) {
+ // If the value of the load is locally available within the block, just use
+ // it. This frequently occurs for reg2mem'd allocas.
+ //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+
+ // If the returned value is the load itself, replace with an undef. This can
+ // only happen in dead loops.
+ if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+ // The available value may have a different type than the load; cast it
+ // right before LI so the replacement is type-correct.
+ if (AvailableVal->getType() != LI->getType())
+ AvailableVal =
+ CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
+ LI->replaceAllUsesWith(AvailableVal);
+ LI->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if we scanned the whole block and got to the top of the block,
+ // we know the block is locally transparent to the load. If not, something
+ // might clobber its value.
+ if (BBIt != LoadBB->begin())
+ return false;
+
+ // If all of the loads and stores that feed the value have the same AA tags,
+ // then we can propagate them onto any newly inserted loads.
+ AAMDNodes AATags;
+ LI->getAAMetadata(AATags);
+
+ // PredsScanned: every distinct predecessor examined.
+ // AvailablePreds: (predecessor, value) pairs where the value was found.
+ // OneUnavailablePred: the last predecessor seen where it was NOT found.
+ SmallPtrSet<BasicBlock*, 8> PredsScanned;
+ typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+ AvailablePredsTy AvailablePreds;
+ BasicBlock *OneUnavailablePred = nullptr;
+
+ // If we got here, the loaded value is transparent through to the start of the
+ // block. Check to see if it is available in any of the predecessor blocks.
+ for (BasicBlock *PredBB : predecessors(LoadBB)) {
+ // If we already scanned this predecessor, skip it.
+ if (!PredsScanned.insert(PredBB).second)
+ continue;
+
+ // Scan the predecessor to see if the value is available in the pred.
+ BBIt = PredBB->end();
+ AAMDNodes ThisAATags;
+ Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt,
+ DefMaxInstsToScan,
+ nullptr, &ThisAATags);
+ if (!PredAvailable) {
+ OneUnavailablePred = PredBB;
+ continue;
+ }
+
+ // If AA tags disagree or are not present, forget about them.
+ if (AATags != ThisAATags) AATags = AAMDNodes();
+
+ // If so, this load is partially redundant. Remember this info so that we
+ // can create a PHI node.
+ AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+ }
+
+ // If the loaded value isn't available in any predecessor, it isn't partially
+ // redundant.
+ if (AvailablePreds.empty()) return false;
+
+ // Okay, the loaded value is available in at least one (and maybe all!)
+ // predecessors. If the value is unavailable in more than one unique
+ // predecessor, we want to insert a merge block for those common predecessors.
+ // This ensures that we only have to insert one reload, thus not increasing
+ // code size.
+ BasicBlock *UnavailablePred = nullptr;
+
+ // If there is exactly one predecessor where the value is unavailable, the
+ // already computed 'OneUnavailablePred' block is it. If it ends in an
+ // unconditional branch, we know that it isn't a critical edge.
+ if (PredsScanned.size() == AvailablePreds.size()+1 &&
+ OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+ UnavailablePred = OneUnavailablePred;
+ } else if (PredsScanned.size() != AvailablePreds.size()) {
+ // Otherwise, we had multiple unavailable predecessors or we had a critical
+ // edge from the one.
+ SmallVector<BasicBlock*, 8> PredsToSplit;
+ SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+ for (const auto &AvailablePred : AvailablePreds)
+ AvailablePredSet.insert(AvailablePred.first);
+
+ // Add all the unavailable predecessors to the PredsToSplit list.
+ for (BasicBlock *P : predecessors(LoadBB)) {
+ // If the predecessor is an indirect goto, we can't split the edge.
+ // Nothing has been modified yet at this point, so bailing out is safe.
+ if (isa<IndirectBrInst>(P->getTerminator()))
+ return false;
+
+ if (!AvailablePredSet.count(P))
+ PredsToSplit.push_back(P);
+ }
+
+ // Split them out to their own block.
+ UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
+ }
+
+ // If the value isn't available in all predecessors, then there will be
+ // exactly one where it isn't available. Insert a load on that edge and add
+ // it to the AvailablePreds list.
+ if (UnavailablePred) {
+ assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+ "Can't handle critical edge here!");
+ // The reload is a non-volatile load of the same pointer with LI's
+ // alignment, placed just before UnavailablePred's terminator.
+ LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+ LI->getAlignment(),
+ UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LI->getDebugLoc());
+ if (AATags)
+ NewVal->setAAMetadata(AATags);
+
+ AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+ }
+
+ // Now we know that each predecessor of this block has a value in
+ // AvailablePreds, sort them for efficient access as we're walking the preds.
+ array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+ // Create a PHI node at the start of the block for the PRE'd load value.
+ pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+ &LoadBB->front());
+ PN->takeName(LI);
+ PN->setDebugLoc(LI->getDebugLoc());
+
+ // Insert new entries into the PHI for each predecessor. A single block may
+ // have multiple entries here.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ // Binary-search the sorted list; pairing P with a null Value* makes the
+ // probe compare no greater than any real entry for P.
+ AvailablePredsTy::iterator I =
+ std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+ std::make_pair(P, (Value*)nullptr));
+
+ assert(I != AvailablePreds.end() && I->first == P &&
+ "Didn't find entry for predecessor!");
+
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
+ }
+
+ //cerr << "PRE: " << *LI << *PN << "\n";
+
+ // The PHI now carries the loaded value on every incoming edge, so the
+ // original load is fully redundant.
+ LI->replaceAllUsesWith(PN);
+ LI->eraseFromParent();
+
+ return true;
+ }
+
+ /// FindMostPopularDest - Given a list of (predecessor, destination) pairs
+ /// that names several possible threadable destinations, return the
+ /// destination that occurs most often.
+ ///
+ /// Null destinations (which stand for 'undef') are not counted.  When several
+ /// destinations are equally popular, the one that comes first in the successor
+ /// list of BB's terminator is returned so that the choice is deterministic.
+ static BasicBlock *
+ FindMostPopularDest(BasicBlock *BB,
+                     const SmallVectorImpl<std::pair<BasicBlock*,
+                                   BasicBlock*> > &PredToDestList) {
+   assert(!PredToDestList.empty());
+
+   // Tally how often each real destination shows up.  We deliberately leave
+   // 'undef' destinations out: threading to known and real destinations is
+   // preferred, and the undef ones can be handled later if interesting.
+   DenseMap<BasicBlock*, unsigned> Votes;
+   for (const auto &Edge : PredToDestList) {
+     if (!Edge.second)
+       continue;
+     Votes[Edge.second]++;
+   }
+
+   // Scan the tally for the maximum, remembering everything tied with it.
+   DenseMap<BasicBlock*, unsigned>::iterator VI = Votes.begin();
+   BasicBlock *Winner = VI->first;
+   unsigned WinnerVotes = VI->second;
+   SmallVector<BasicBlock*, 4> Tied;
+
+   for (++VI; VI != Votes.end(); ++VI) {
+     if (VI->second > WinnerVotes) {
+       // Strictly more popular: this is the new leader and old ties are moot.
+       Tied.clear();
+       Winner = VI->first;
+       WinnerVotes = VI->second;
+     } else if (VI->second == WinnerVotes) {
+       // Just as popular as the current leader: keep track of it.
+       Tied.push_back(VI->first);
+     }
+     // Anything less popular is ignored.
+   }
+
+   // A unique maximum needs no tie-breaking.
+   if (Tied.empty())
+     return Winner;
+
+   // Several destinations share the maximum.  The choice is arbitrary, but it
+   // has to be deterministic, so take whichever of them appears first among
+   // the successors of BB's terminator.
+   Tied.push_back(Winner);
+   TerminatorInst *TI = BB->getTerminator();
+   unsigned i = 0;
+   while (true) {
+     assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+
+     if (std::find(Tied.begin(), Tied.end(), TI->getSuccessor(i)) !=
+         Tied.end())
+       break;
+     ++i;
+   }
+
+   return TI->getSuccessor(i);
+ }
+
+ /// ProcessThreadableEdges - Try to thread some of BB's predecessors directly
+ /// to one of BB's successors, using values of Cond that are known on entry
+ /// from those predecessors.
+ ///
+ /// Preference selects the kind of constant that is looked for and CxtI is the
+ /// context instruction handed on to ComputeValueKnownInPredecessors.  All
+ /// predecessors that lead to the most popular destination are collected and
+ /// passed to ThreadEdge, whose result is returned.  Returns false if BB is a
+ /// loop header or if no predecessor has a usable known value.
+ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+                                            ConstantPreference Preference,
+                                            Instruction *CxtI) {
+   // If threading this would thread across a loop header, don't even try to
+   // thread the edge.
+   if (LoopHeaders.count(BB))
+     return false;
+
+   PredValueInfoTy PredValues;
+   if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
+     return false;
+
+   assert(!PredValues.empty() &&
+          "ComputeValueKnownInPredecessors returned true with no values");
+
+   DEBUG(dbgs() << "IN BB: " << *BB;
+         for (const auto &PredValue : PredValues) {
+           dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
+             << *PredValue.first
+             << " for pred '" << PredValue.second->getName() << "'.\n";
+         });
+
+   // Turn the list of known values into a list of known destinations, one per
+   // predecessor.  Duplicate predecessors are dropped here, and an undefined
+   // input is recorded as a null destination.
+   SmallPtrSet<BasicBlock*, 16> VisitedPreds;
+   SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+   // OnlyDest holds the single destination seen so far, or the sentinel once
+   // two different destinations have been encountered.
+   BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+   BasicBlock *OnlyDest = nullptr;
+
+   // Nothing in the loop below modifies BB, so look up its terminator once.
+   TerminatorInst *BBTerm = BB->getTerminator();
+
+   for (const auto &Known : PredValues) {
+     BasicBlock *Pred = Known.second;
+
+     // Duplicate predecessor entry.
+     if (!VisitedPreds.insert(Pred).second)
+       continue;
+
+     // A predecessor that ends with an indirect goto can't have its
+     // destination changed.
+     if (isa<IndirectBrInst>(Pred->getTerminator()))
+       continue;
+
+     Constant *Val = Known.first;
+
+     // Map the known value onto the successor BB would jump to.  Undef keeps
+     // the null destination.
+     BasicBlock *DestBB = nullptr;
+     if (!isa<UndefValue>(Val)) {
+       if (BranchInst *BI = dyn_cast<BranchInst>(BBTerm)) {
+         DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+       } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BBTerm)) {
+         DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+       } else {
+         assert(isa<IndirectBrInst>(BB->getTerminator())
+                && "Unexpected terminator");
+         DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+       }
+     }
+
+     // If we have exactly one destination, remember it for efficiency below.
+     if (PredToDestList.empty())
+       OnlyDest = DestBB;
+     else if (OnlyDest != DestBB)
+       OnlyDest = MultipleDestSentinel;
+
+     PredToDestList.push_back(std::make_pair(Pred, DestBB));
+   }
+
+   // If all edges were unthreadable, we fail.
+   if (PredToDestList.empty())
+     return false;
+
+   // Work out the most common successor.  With many inputs and a switch we
+   // want to thread the batch going to the most popular destination first.  In
+   // the common case of a single known destination the search is skipped.
+   BasicBlock *MostPopularDest =
+       OnlyDest == MultipleDestSentinel ? FindMostPopularDest(BB, PredToDestList)
+                                        : OnlyDest;
+
+   // Gather every predecessor that jumps to the most popular destination so
+   // they can be factored into a single predecessor.
+   SmallVector<BasicBlock*, 16> PredsToFactor;
+   for (const auto &Edge : PredToDestList) {
+     if (Edge.second != MostPopularDest)
+       continue;
+
+     BasicBlock *Pred = Edge.first;
+
+     // The predecessor may be a switch or something else with several edges
+     // to BB.  List it once per such edge.
+     for (BasicBlock *Succ : successors(Pred))
+       if (Succ == BB)
+         PredsToFactor.push_back(Pred);
+   }
+
+   // If the threadable edges are branching on an undefined value, we get to
+   // pick the destination that these predecessors should get to.
+   if (!MostPopularDest)
+     MostPopularDest = BB->getTerminator()->
+                             getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+   // Ok, try to thread it!
+   return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+ }
+
+ /// ProcessBranchOnPHI - The block containing PN ends in a conditional branch
+ /// on PN that could not be threaded otherwise.  Look at the incoming blocks of
+ /// the phi for further simplification opportunities.
+ ///
+ /// Returns true as soon as the branch has been duplicated into one of the
+ /// predecessors, false if no predecessor qualified.
+ bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
+   BasicBlock *BB = PN->getParent();
+
+   // TODO: We could make use of this to do it once for blocks with common PHI
+   // values.
+   SmallVector<BasicBlock*, 1> PredBBs;
+   PredBBs.resize(1);
+
+   // For any predecessor that ends in an unconditional branch we can
+   // *duplicate* the conditional branch into it.  That encourages further jump
+   // threading and gets rid of branches on a phi of an icmp (branch on icmp is
+   // much better).
+   const unsigned NumIncoming = PN->getNumIncomingValues();
+   for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+     BasicBlock *PredBB = PN->getIncomingBlock(Idx);
+
+     BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+     if (!PredBr || !PredBr->isUnconditional())
+       continue;
+
+     // Try to duplicate BB into PredBB.
+     PredBBs[0] = PredBB;
+     if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+       return true;
+   }
+
+   return false;
+ }
+
+ /// ProcessBranchOnXOR - The block containing BO ends in a conditional branch
+ /// on this xor that could not be threaded otherwise.  See whether values of
+ /// the xor operands that are known in the predecessors allow a simplification.
+ ///
+ /// Returns true if the xor was simplified in place or if the block was
+ /// duplicated into predecessors, false otherwise.
+ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
+   BasicBlock *BB = BO->getParent();
+
+   // Don't do this optimization when either side of the xor is a constant.
+   if (isa<ConstantInt>(BO->getOperand(0)) ||
+       isa<ConstantInt>(BO->getOperand(1)))
+     return false;
+
+   // Without a phi at the top of BB we can't infer anything special about any
+   // particular predecessor.
+   if (!isa<PHINode>(BB->front()))
+     return false;
+
+   // If we have a xor as the branch input to this block, and we know that the
+   // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+   // the condition into the predecessor and fix that value to true, saving some
+   // logical ops on that path and encouraging other paths to simplify.
+   //
+   // This copies something like this:
+   //
+   //  BB:
+   //    %X = phi i1 [1],  [%X']
+   //    %Y = icmp eq i32 %A, %B
+   //    %Z = xor i1 %X, %Y
+   //    br i1 %Z, ...
+   //
+   // Into:
+   //  BB':
+   //    %Y = icmp ne i32 %A, %B
+   //    br i1 %Y, ...
+
+   // Try the LHS first and fall back to the RHS.  isLHS records which operand
+   // the known values belong to.
+   PredValueInfoTy XorOpValues;
+   bool isLHS = ComputeValueKnownInPredecessors(BO->getOperand(0), BB,
+                                                XorOpValues, WantInteger, BO);
+   if (!isLHS) {
+     assert(XorOpValues.empty());
+     if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+                                          WantInteger, BO))
+       return false;
+   }
+
+   assert(!XorOpValues.empty() &&
+          "ComputeValueKnownInPredecessors returned true with no values");
+
+   // Count how many predecessors provide true and how many provide false.
+   // Undef inputs take no part in the count.
+   unsigned NumTrue = 0, NumFalse = 0;
+   for (const auto &Known : XorOpValues) {
+     if (isa<UndefValue>(Known.first))
+       continue;
+
+     if (cast<ConstantInt>(Known.first)->isZero())
+       ++NumFalse;
+     else
+       ++NumTrue;
+   }
+
+   // Split on the more popular value, with false winning ties.  SplitVal stays
+   // null (meaning undef) when every input is undef.
+   ConstantInt *SplitVal = nullptr;
+   if (NumTrue != 0 || NumFalse != 0) {
+     if (NumTrue > NumFalse)
+       SplitVal = ConstantInt::getTrue(BB->getContext());
+     else
+       SplitVal = ConstantInt::getFalse(BB->getContext());
+   }
+
+   // Collect all of the blocks that this can be folded into so that we can
+   // factor this once and clone it once.  Undef inputs fit any split value.
+   SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+   for (const auto &Known : XorOpValues)
+     if (Known.first == SplitVal || isa<UndefValue>(Known.first))
+       BlocksToFoldInto.push_back(Known.second);
+
+   // Unless a value was inferred for all of the predecessors, duplicate BB into
+   // the collected predecessors.
+   if (BlocksToFoldInto.size() !=
+       cast<PHINode>(BB->front()).getNumIncomingValues())
+     return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+
+   // Every predecessor is covered, so duplication won't help us.  However, we
+   // can just replace the LHS or RHS with the constant.
+   if (!SplitVal) {
+     // If all preds provide undef, just nuke the xor, because it is undef too.
+     BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+     BO->eraseFromParent();
+   } else if (SplitVal->isZero()) {
+     // If all preds provide 0, replace the xor with the other input.
+     BO->replaceAllUsesWith(BO->getOperand(isLHS));
+     BO->eraseFromParent();
+   } else {
+     // If all preds provide 1, set the computed value to 1.
+     BO->setOperand(!isLHS, SplitVal);
+   }
+
+   return true;
+ }
+
+
+ /// AddPHINodeEntriesForMappedBlock - 'NewPred' is being added as a new
+ /// predecessor of the block PHIBB.  For every PHI node at the top of PHIBB,
+ /// add an incoming entry for NewPred.  The value is the one the PHI has for
+ /// OldPred, translated through ValueMap when it is an instruction with an
+ /// entry in the map.
+ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+                                             BasicBlock *OldPred,
+                                             BasicBlock *NewPred,
+                                      DenseMap<Instruction*, Value*> &ValueMap) {
+   // PHI nodes are grouped at the start of the block; stop at the first
+   // instruction that is not one.
+   BasicBlock::iterator It = PHIBB->begin();
+   while (PHINode *PN = dyn_cast<PHINode>(It)) {
+     // Start from the value that flows in from OldPred.
+     Value *Incoming = PN->getIncomingValueForBlock(OldPred);
+
+     // Remap the value if necessary.
+     if (Instruction *Inst = dyn_cast<Instruction>(Incoming)) {
+       DenseMap<Instruction*, Value*>::iterator Mapped = ValueMap.find(Inst);
+       if (Mapped != ValueMap.end())
+         Incoming = Mapped->second;
+     }
+
+     PN->addIncoming(Incoming, NewPred);
+     ++It;
+   }
+ }
+
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB. Transform the IR to reflect this change.
+bool JumpThreading::ThreadEdge(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock*> &PredBBs,
+ BasicBlock *SuccBB) {
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB)) {
+ DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName()
+ << "' to dest BB '" << SuccBB->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
+ DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ }
+
+ // And finally, do it!
+ DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
+ << SuccBB->getName() << "' with cost: " << JumpThreadCost
+ << ", across block:\n "
+ << *BB << "\n");
+
+ LVI->threadEdge(PredBB, BB, SuccBB);
+
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+ BB->getName()+".thread",
+ BB->getParent(), BB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq =
+ BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; !isa<TerminatorInst>(BI); ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ NewBB->getInstList().push_back(New);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ // We didn't copy the terminator from BB over to NewBB, because there is now
+ // an unconditional jump to SuccBB. Insert the unconditional jump.
+ BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+ NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+ // PHI nodes for NewBB now.
+ AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+ // PHI insertion, of which we are prepared to do, clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use*, 16> UsesToRename;
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+ // We found a use of I outside of BB. Rename all uses of I that are outside
+ // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
+ // with the two values we know.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ DEBUG(dbgs() << "\n");
+ }
+
+
+ // Ok, NewBB is good to go. Update the terminator of PredBB to jump to
+ // NewBB instead of BB. This eliminates predecessors from BB, which requires
+ // us to simplify any PHI nodes in BB.
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // At this point, the IR is fully up to date and consistent. Do a quick scan
+ // over the new instructions and zap any that are constants or dead. This
+ // frequently happens because of phi translation.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+
+ // Update the edge weight from BB to SuccBB, which should be less than before.
+ UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
+
+ // Threaded an edge!
+ ++NumThreads;
+ return true;
+}
+
+/// Route every edge from a block in Preds to BB through one freshly created
+/// block, which becomes the successor of all blocks in Preds and a predecessor
+/// of BB. When profile data is available, the frequency of the new block is
+/// recorded in BFI as the sum of the incoming edge frequencies.
+BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB,
+                                           ArrayRef<BasicBlock *> Preds,
+                                           const char *Suffix) {
+  // Measure the incoming frequency up front, while the Pred->BB edges (and
+  // their probabilities in BPI) still describe the unmodified CFG.
+  BlockFrequency IncomingFreq(0);
+  if (HasProfileData) {
+    for (BasicBlock *Pred : Preds) {
+      BranchProbability EdgeProb = BPI->getEdgeProbability(Pred, BB);
+      IncomingFreq += BFI->getBlockFreq(Pred) * EdgeProb;
+    }
+  }
+
+  BasicBlock *MergedPred = SplitBlockPredecessors(BB, Preds, Suffix);
+
+  // The new block receives exactly the frequency that used to flow along the
+  // edges it now collects.
+  if (HasProfileData)
+    BFI->setBlockFreq(MergedPred, IncomingFreq.getFrequency());
+
+  return MergedPred;
+}
+
+/// Re-balance profile information for BB after part of its incoming flow has
+/// been redirected through NewBB to SuccBB. The frequency of BB is reduced by
+/// the frequency of NewBB, the frequency of the edge BB->SuccBB is reduced by
+/// the same amount, and the outgoing edge probabilities of BB are recomputed
+/// from the resulting edge frequencies, both in BPI and in the branch-weight
+/// metadata on BB's terminator. Does nothing without profile data. PredBB is
+/// not consulted by this function.
+void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+                                                 BasicBlock *BB,
+                                                 BasicBlock *NewBB,
+                                                 BasicBlock *SuccBB) {
+  if (!HasProfileData)
+    return;
+
+  assert(BFI && BPI && "BFI & BPI should have been created here");
+
+  // Snapshot everything that depends on the old state before BFI is updated.
+  BlockFrequency OldBBFreq = BFI->getBlockFreq(BB);
+  BlockFrequency ThreadedFreq = BFI->getBlockFreq(NewBB);
+  BlockFrequency OldToSuccFreq =
+      OldBBFreq * BPI->getEdgeProbability(BB, SuccBB);
+
+  // The flow that now goes through NewBB no longer passes through BB.
+  BlockFrequency ShrunkBBFreq = OldBBFreq - ThreadedFreq;
+  BFI->setBlockFreq(BB, ShrunkBBFreq.getFrequency());
+
+  // Frequency of every outgoing edge of BB, in successor order. Only the edge
+  // to SuccBB loses the threaded flow; the others keep their old frequency.
+  SmallVector<uint64_t, 4> EdgeFreqs;
+  for (BasicBlock *Succ : successors(BB)) {
+    BlockFrequency EdgeFreq;
+    if (Succ == SuccBB)
+      EdgeFreq = OldToSuccFreq - ThreadedFreq;
+    else
+      EdgeFreq = OldBBFreq * BPI->getEdgeProbability(BB, Succ);
+    EdgeFreqs.push_back(EdgeFreq.getFrequency());
+  }
+
+  uint64_t LargestEdgeFreq =
+      *std::max_element(EdgeFreqs.begin(), EdgeFreqs.end());
+
+  // Turn the edge frequencies into probabilities.
+  SmallVector<BranchProbability, 4> EdgeProbs;
+  if (LargestEdgeFreq != 0) {
+    for (uint64_t Freq : EdgeFreqs)
+      EdgeProbs.push_back(
+          BranchProbability::getBranchProbability(Freq, LargestEdgeFreq));
+    // Normalize edge probabilities so that they sum up to one.
+    BranchProbability::normalizeProbabilities(EdgeProbs.begin(),
+                                              EdgeProbs.end());
+  } else {
+    // No flow left on any edge: fall back to a uniform distribution.
+    unsigned NumSuccs = static_cast<unsigned>(EdgeFreqs.size());
+    EdgeProbs.assign(EdgeFreqs.size(), BranchProbability(1, NumSuccs));
+  }
+
+  // Publish the new probabilities to BPI, one per successor index.
+  unsigned SuccIdx = 0;
+  for (BranchProbability Prob : EdgeProbs)
+    BPI->setEdgeProbability(BB, SuccIdx++, Prob);
+
+  // Branch-weight metadata is only attached to terminators with at least two
+  // successors.
+  if (EdgeProbs.size() < 2)
+    return;
+
+  SmallVector<uint32_t, 4> Weights;
+  for (BranchProbability Prob : EdgeProbs)
+    Weights.push_back(Prob.getNumerator());
+
+  TerminatorInst *Term = BB->getTerminator();
+  MDBuilder MDB(Term->getParent()->getContext());
+  Term->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
+
+/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB, do so now; this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare. PredBBs must be non-empty. Returns true if BB was duplicated, and false (leaving the IR untouched) if BB is a known loop header or the duplication cost exceeds BBDupThreshold.
+bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &PredBBs) {
+ assert(!PredBBs.empty() && "Can't handle an empty set");
+
+ // If BB is a loop header, then duplicating this block outside the loop would
+ // cause us to transform this into an irreducible loop, don't do this.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB)) {
+ DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
+ DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed, so that
+ // the rest of the function only has to deal with a single predecessor PredBB.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+ }
+
+ // Okay, we decided to do this! Clone all the instructions in BB onto the end
+ // of PredBB.
+ DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
+ << PredBB->getName() << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
+
+ // Unless PredBB ends with an unconditional branch, split the edge so that we
+ // can just clone the bits from BB into the end of the new PredBB.
+ BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+ if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+ PredBB = SplitEdge(PredBB, BB);  // PredBB now names the block created on the edge.
+ OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+ }
+
+ // We are going to have to map operands from the original BB block into the
+ // PredBB block. Evaluate PHI nodes in BB: each PHI maps to its incoming value from PredBB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+ // Clone the non-phi instructions of BB into PredBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; BI != BB->end(); ++BI) {  // Runs to the end of BB, so BB's terminator is cloned as well.
+ Instruction *New = BI->clone();
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+
+ // If this instruction can be simplified after the operands are updated,
+ // just use the simplified value instead. This frequently happens due to
+ // phi translation.
+ if (Value *IV =
+ SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
+ delete New;  // The clone was never inserted into a block, so it is simply freed.
+ ValueMapping[&*BI] = IV;
+ } else {
+ // Otherwise, insert the new instruction into the block, right before PredBB's old branch.
+ New->setName(BI->getName());
+ PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ ValueMapping[&*BI] = New;
+ }
+ }
+
+ // Check to see if the targets of the branch had PHI nodes. If so, we need to
+ // add entries to the PHI nodes for branch from PredBB now.
+ BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());  // Both successor 0 and successor 1 are used below.
+ AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+ ValueMapping);
+ AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+ ValueMapping);
+
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+ // PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use*, 16> UsesToRename;
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)  // A PHI use is attributed to the incoming block, not to the PHI's own block.
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+ // We found a use of I outside of BB. Rename all uses of I that are outside
+ // its block to be uses of the appropriate PHI node etc. Seed the updater
+ // with the two values we know: the original in BB and its counterpart in PredBB.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())  // Drains the list, so it is empty again for the next instruction.
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ DEBUG(dbgs() << "\n");
+ }
+
+ // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+ // that we nuked.
+ BB->removePredecessor(PredBB, true);
+
+ // Remove the unconditional branch at the end of the PredBB block; the cloned terminator of BB was inserted before it and remains.
+ OldPredBranch->eraseFromParent();
+
+ ++NumDupes;
+ return true;
+}
+
+/// TryToUnfoldSelect - Handle blocks of the form
+///   bb1:
+///     %a = select
+///     br bb2
+///
+///   bb2:
+///     %p = phi [%a, %bb1] ...
+///     %c = icmp %p
+///     br i1 %c
+///
+/// If exactly the kind of asymmetry we want exists (the compare folds
+/// differently for the two arms of the select, with at least one arm folding),
+/// the select is expanded into a conditional branch through a new block. This
+/// later enables threading from bb1 over bb2. Returns true if a select was
+/// unfolded. The second operand of CondCmp must be a Constant.
+bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+  BranchInst *BBBranch = dyn_cast<BranchInst>(BB->getTerminator());
+  PHINode *PhiOperand = dyn_cast<PHINode>(CondCmp->getOperand(0));
+  Constant *ConstOperand = cast<Constant>(CondCmp->getOperand(1));
+
+  // BB must end in a conditional branch ...
+  if (!BBBranch || !BBBranch->isConditional())
+    return false;
+  // ... and the compared value must be a PHI that lives in BB.
+  if (!PhiOperand || PhiOperand->getParent() != BB)
+    return false;
+
+  const unsigned NumIncoming = PhiOperand->getNumIncomingValues();
+  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+    BasicBlock *Pred = PhiOperand->getIncomingBlock(Idx);
+    SelectInst *Sel = dyn_cast<SelectInst>(PhiOperand->getIncomingValue(Idx));
+
+    // Only a single-use select defined in the matching predecessor qualifies.
+    if (!Sel || Sel->getParent() != Pred || !Sel->hasOneUse())
+      continue;
+
+    // The predecessor has to fall straight through to BB.
+    BranchInst *PredBr = dyn_cast<BranchInst>(Pred->getTerminator());
+    if (!PredBr || !PredBr->isUnconditional())
+      continue;
+
+    // Ask LVI how the compare would fold for each arm of the select. When
+    // both arms fold the same way (or neither folds) there is nothing to gain
+    // here; cases where both sides fold are threaded in any case.
+    LazyValueInfo::Tristate TrueArmFold =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), Sel->getOperand(1),
+                                ConstOperand, Pred, BB, CondCmp);
+    LazyValueInfo::Tristate FalseArmFold =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), Sel->getOperand(2),
+                                ConstOperand, Pred, BB, CondCmp);
+    const bool NeitherFolds = TrueArmFold == LazyValueInfo::Unknown &&
+                              FalseArmFold == LazyValueInfo::Unknown;
+    if (NeitherFolds || TrueArmFold == FalseArmFold)
+      continue;
+
+    // Expand the select.
+    //
+    // Pred --
+    //  |    v
+    //  |  NewBB
+    //  |    |
+    //  |-----
+    //  v
+    // BB
+    BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+                                           BB->getParent(), BB);
+
+    // The old unconditional branch becomes the terminator of NewBB.
+    PredBr->removeFromParent();
+    NewBB->getInstList().push_back(PredBr);
+
+    // Pred now branches on the select's condition: the true arm goes through
+    // NewBB, the false arm goes to BB directly.
+    BranchInst::Create(NewBB, BB, Sel->getCondition(), Pred);
+    PhiOperand->setIncomingValue(Idx, Sel->getFalseValue());
+    PhiOperand->addIncoming(Sel->getTrueValue(), NewBB);
+
+    // The select is now dead.
+    Sel->eraseFromParent();
+
+    // Every other PHI in BB sees the same value along the new edge as it does
+    // along the edge from Pred.
+    BasicBlock::iterator It = BB->begin();
+    while (PHINode *OtherPhi = dyn_cast<PHINode>(It)) {
+      if (OtherPhi != PhiOperand)
+        OtherPhi->addIncoming(OtherPhi->getIncomingValueForBlock(Pred), NewBB);
+      ++It;
+    }
+    return true;
+  }
+
+  return false;
+}
+
+/// TryToUnfoldSelectInCurrBB - Look for a PHI and a select in the same block,
+/// where the PHI is the condition of the select:
+///   bb:
+///     %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
+///     %s = select p, trueval, falseval
+///
+/// and expand the select into a branch structure. This later enables
+/// jump-threading over bb in this pass.
+///
+/// Like SimplifyCFG::FoldCondBranchOnPHI(), the select is only unfolded when
+/// the PHI has at least one constant incoming value. If the unfolded select
+/// is not jump-threaded, it will be folded again by later optimizations.
+/// Returns true if a select was unfolded.
+bool JumpThreading::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+  // If threading this would thread across a loop header, don't thread the
+  // edge. See the comments above FindLoopHeaders for justifications and
+  // caveats.
+  if (LoopHeaders.count(BB))
+    return false;
+
+  // Walk the leading PHI nodes of BB.
+  for (BasicBlock::iterator It = BB->begin(); isa<PHINode>(It); ++It) {
+    PHINode *Phi = cast<PHINode>(&*It);
+
+    const unsigned NumIncoming = Phi->getNumIncomingValues();
+    if (NumIncoming == 0 || !Phi->hasOneUse())
+      continue;
+
+    // The single user has to be a select that lives in BB ...
+    SelectInst *Sel = dyn_cast<SelectInst>(Phi->user_back());
+    if (!Sel || Sel->getParent() != BB)
+      continue;
+
+    // ... and it has to use the PHI as its i1 condition.
+    Value *SelCond = Sel->getCondition();
+    if (!SelCond || SelCond != Phi || !SelCond->getType()->isIntegerTy(1))
+      continue;
+
+    // Give up on the whole block as soon as an incoming edge comes from BB
+    // itself; otherwise note whether any incoming value is a constant.
+    bool SawConstant = false;
+    for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+      if (Phi->getIncomingBlock(Idx) == BB)
+        return false;
+      if (isa<ConstantInt>(Phi->getIncomingValue(Idx)))
+        SawConstant = true;
+    }
+    if (!SawConstant)
+      continue;
+
+    // Expand the select: split the block before it, branch on the condition,
+    // and merge the two arms with a new two-entry PHI placed before the
+    // select.
+    TerminatorInst *ThenTerm =
+        SplitBlockAndInsertIfThen(Sel->getCondition(), Sel, false);
+    PHINode *Merged = PHINode::Create(Sel->getType(), 2, "", Sel);
+    Merged->addIncoming(Sel->getTrueValue(), ThenTerm->getParent());
+    Merged->addIncoming(Sel->getFalseValue(), BB);
+    Sel->replaceAllUsesWith(Merged);
+    Sel->eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 0000000..8923ff7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,1107 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible. It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe. This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+// 1. Moving loop invariant loads and calls out of loops. If we can determine
+// that a load or call inside of a loop never aliases anything stored to,
+// we can hoist it or sink it like any other instruction.
+// 2. Scalar Promotion of Memory - If there is a store instruction inside of
+// the loop, we try to move the store to happen AFTER the loop instead of
+// inside of the loop. This can only happen if a few conditions are true:
+// A. The pointer stored through is loop invariant
+// B. There are no stores or loads in the loop which _may_ alias the
+// pointer. There are no calls in the loop which mod/ref the pointer.
+// If these conditions are true, we can promote the loads and stores in the
+// loop of the pointer to use a temporary alloca'd variable. We then use
+// the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "licm"
+
+STATISTIC(NumSunk , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted , "Number of memory locations promoted to registers");
+
+static cl::opt<bool>  // Hidden command-line switch; when it is set, runOnLoop skips the scalar-promotion step.
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+ cl::desc("Disable memory promotion in LICM pass"));
+
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);  // File-local helper; consulted by sinkRegion and hoistRegion to skip sub-loop blocks.
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,  // File-local helper; sinkRegion's precondition for sinking I.
+ const LICMSafetyInfo *SafetyInfo);
+static bool hoist(Instruction &I, BasicBlock *Preheader);  // File-local helper; called by hoistRegion with the loop preheader.
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,  // File-local helper; called by sinkRegion.
+ const Loop *CurLoop, AliasSetTracker *CurAST,
+ const LICMSafetyInfo *SafetyInfo);
+static bool isGuaranteedToExecute(const Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LICMSafetyInfo *SafetyInfo);
+static bool isSafeToExecuteUnconditionally(const Instruction &Inst,  // File-local helper; hoistRegion passes the preheader terminator as CtxI.
+ const DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const Loop *CurLoop,
+ const LICMSafetyInfo *SafetyInfo,
+ const Instruction *CtxI = nullptr);  // CtxI is optional and defaults to null.
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST);
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LICMSafetyInfo *SafetyInfo);
+static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,  // File-local helper; shared legality check used by both sinkRegion and hoistRegion.
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LICMSafetyInfo *SafetyInfo);
+
+namespace {
+ struct LICM : public LoopPass {  // The loop pass implementing loop invariant code motion; runOnLoop is the entry point.
+ static char ID; // Pass identification, replacement for typeid
+ LICM() : LoopPass(ID) {
+ initializeLICMPass(*PassRegistry::getPassRegistry());  // Registers this pass with the global PassRegistry.
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG. It declares that it preserves
+ /// the CFG and the analyses marked as preserved below.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();  // Preserved but not required; runOnLoop only uses it if available.
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ using llvm::Pass::doFinalization;  // Keep the base-class overloads visible next to the override below.
+
+ bool doFinalization() override {
+ assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");  // Every saved tracker should have been consumed by now.
+ return false;
+ }
+
+ private:
+ AliasAnalysis *AA; // Current AliasAnalysis information
+ LoopInfo *LI; // Current LoopInfo
+ DominatorTree *DT; // Dominator Tree for the current Loop.
+
+ TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding.
+
+ // State that is updated as we process loops.
+ bool Changed; // Set to true when we change anything.
+ BasicBlock *Preheader; // The preheader block of the current loop...
+ Loop *CurLoop; // The current loop we are working on...
+ AliasSetTracker *CurAST; // AliasSet information for the current loop...
+ DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;  // Trackers saved by runOnLoop for nested loops until their parent loop is processed.
+
+ /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+ void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+ Loop *L) override;
+
+ /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+ /// set.
+ void deleteAnalysisValue(Value *V, Loop *L) override;
+
+ /// Simple Analysis hook. Delete loop L from alias set map.
+ void deleteAnalysisLoop(Loop *L) override;
+ };
+}
+
+char LICM::ID = 0;  // Definition of the static pass ID declared in the class.
+INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
+
+Pass *llvm::createLICMPass() { return new LICM(); }  // Public factory: returns a newly allocated LICM pass instance.
+
+/// Run LICM on loop L: sink instructions that are not used inside the loop,
+/// hoist loop-invariant instructions into the preheader, and then promote
+/// eligible memory locations to scalars. Returns true if anything changed.
+///
+/// Note, alias info for inner loops is not preserved, so it is not a good idea
+/// to run LICM multiple times on one loop.
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  Changed = false;
+
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
+  CurAST = new AliasSetTracker(*AA);
+  // Collect Alias info from subloops.
+  for (Loop *InnerL : L->getSubLoops()) {
+    AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL];
+    assert(InnerAST && "Where is my AST?");
+
+    // What if InnerLoop was modified by other passes ?
+    CurAST->add(*InnerAST);
+
+    // Once we've incorporated the inner loop's AST into ours, we don't need the
+    // subloop's anymore.
+    delete InnerAST;
+    LoopToAliasSetMap.erase(InnerL);
+  }
+
+  CurLoop = L;
+
+  // Get the preheader block to move instructions into...
+  Preheader = L->getLoopPreheader();
+
+  // Loop over the body of this loop, looking for calls, invokes, and stores.
+  // Because subloops have already been incorporated into AST, we skip blocks in
+  // subloops.
+  //
+  for (BasicBlock *BB : L->blocks()) {
+    if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops.
+      CurAST->add(*BB);          // Incorporate the specified basic block
+  }
+
+  // Compute loop safety information.
+  LICMSafetyInfo SafetyInfo;
+  computeLICMSafetyInfo(&SafetyInfo, CurLoop);
+
+  // We want to visit all of the instructions in this loop... that are not parts
+  // of our subloops (they have already had their invariants hoisted out of
+  // their loop, into this loop, so there is no need to process the BODIES of
+  // the subloops).
+  //
+  // Traverse the body of the loop in depth first order on the dominator tree so
+  // that we are guaranteed to see definitions before we see uses. This allows
+  // us to sink instructions in one pass, without iteration. After sinking
+  // instructions, we perform another pass to hoist them out of the loop.
+  //
+  if (L->hasDedicatedExits())
+    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop,
+                          CurAST, &SafetyInfo);
+  if (Preheader)
+    Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI,
+                           CurLoop, CurAST, &SafetyInfo);
+
+  // Now that all loop invariants have been removed from the loop, promote any
+  // memory references to scalars that we can.
+  if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) {
+    SmallVector<BasicBlock *, 8> ExitBlocks;
+    SmallVector<Instruction *, 8> InsertPts;
+    PredIteratorCache PIC;
+
+    // Promotion is tracked separately from the accumulated Changed flag: the
+    // LCSSA rebuild below exists only because of promotion, so it must not be
+    // triggered merely because sinking or hoisting changed something.
+    bool Promoted = false;
+
+    // Loop over all of the alias sets in the tracker object.
+    for (AliasSet &AS : *CurAST)
+      Promoted |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts,
+                                               PIC, LI, DT, CurLoop,
+                                               CurAST, &SafetyInfo);
+
+    // Once we have promoted values across the loop body we have to recursively
+    // reform LCSSA as any nested loop may now have values defined within the
+    // loop used in the outer loop.
+    // FIXME: This is really heavy handed. It would be a bit better to use an
+    // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+    // it as it went.
+    if (Promoted) {
+      auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+      formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr);
+    }
+
+    Changed |= Promoted;
+  }
+
+  // Check that neither this loop nor its parent have had LCSSA broken. LICM is
+  // specifically moving instructions across the loop boundary and so it is
+  // especially in need of sanity checking here.
+  assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+  assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
+         "Parent loop not left in LCSSA form after LICM!");
+
+  // Clear out loops state information for the next iteration
+  CurLoop = nullptr;
+  Preheader = nullptr;
+
+  // If this loop is nested inside of another one, save the alias information
+  // for when we process the outer loop. Otherwise nobody will consume the
+  // tracker, so it is freed here.
+  if (L->getParentLoop())
+    LoopToAliasSetMap[L] = CurAST;
+  else
+    delete CurAST;
+  return Changed;
+}
+
+/// Sink instructions out of the part of the loop dominated by N. The region
+/// (all blocks dominated by N's block that are inside CurLoop) is walked in
+/// reverse depth first order w.r.t. the DominatorTree, and each block is
+/// scanned bottom-up, so uses are visited before definitions and a loop body
+/// can be sunk in one pass without iteration. Trivially dead instructions
+/// encountered on the way are deleted. Returns true if anything changed.
+///
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+                      DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                      AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr &&
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
+         SafetyInfo != nullptr && "Unexpected input to sinkRegion");
+
+  BasicBlock *BB = N->getBlock();
+
+  // A subregion that is not in the top level loop at all has nothing to sink.
+  if (!CurLoop->contains(BB))
+    return false;
+
+  bool Changed = false;
+
+  // Reverse DFO: the dominator-tree children are handled before this block.
+  for (DomTreeNode *Child : N->getChildren())
+    Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+
+  // Blocks that belong to a subloop have already been processed with that
+  // subloop.
+  if (inSubLoop(BB, CurLoop, LI))
+    return Changed;
+
+  // Scan the block from its last instruction to its first. Pos always points
+  // just past the next instruction to examine.
+  BasicBlock::iterator Pos = BB->end();
+  while (Pos != BB->begin()) {
+    --Pos;
+    Instruction &Inst = *Pos;
+
+    // A dead instruction would look sinkable because it has no use in the
+    // loop; just delete it instead.
+    if (isInstructionTriviallyDead(&Inst, TLI)) {
+      DEBUG(dbgs() << "LICM deleting dead inst: " << Inst << '\n');
+      ++Pos; // Step off the instruction before it is erased.
+      CurAST->deleteValue(&Inst);
+      Inst.eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // An instruction can be sunk to the exit blocks of the loop when all of
+    // its users are outside of the loop. In that case it does not even matter
+    // whether its operands are loop invariant.
+    if (!isNotUsedInLoop(Inst, CurLoop, SafetyInfo))
+      continue;
+    if (!canSinkOrHoistInst(Inst, AA, DT, TLI, CurLoop, CurAST, SafetyInfo))
+      continue;
+
+    ++Pos; // Step off the instruction before it is handed to sink().
+    Changed |= sink(Inst, LI, DT, CurLoop, CurAST, SafetyInfo);
+  }
+
+  return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+/// Instructions that constant-fold are replaced by the folded constant and
+/// erased; instructions with loop-invariant operands that are safe to execute
+/// unconditionally are hoisted into the loop preheader. Returns true if the IR
+/// was modified in any way, including by constant folding.
+///
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+                       DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                       AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr &&
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
+         SafetyInfo != nullptr && "Unexpected input to hoistRegion");
+  // Set changed as false.
+  bool Changed = false;
+  // Get basic block
+  BasicBlock *BB = N->getBlock();
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB))
+    return Changed;
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (!inSubLoop(BB, CurLoop, LI))
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+      // Advance before touching I, since I may be erased or moved below.
+      Instruction &I = *II++;
+      // Try constant folding this instruction. If all the operands are
+      // constants, it is technically hoistable, but it would be better to just
+      // fold it.
+      if (Constant *C = ConstantFoldInstruction(
+              &I, I.getModule()->getDataLayout(), TLI)) {
+        DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
+        CurAST->copyValue(&I, C);
+        CurAST->deleteValue(&I);
+        I.replaceAllUsesWith(C);
+        I.eraseFromParent();
+        // Folding rewrote uses and removed an instruction, so the caller must
+        // be told that the IR changed.
+        Changed = true;
+        continue;
+      }
+
+      // Try hoisting the instruction out to the preheader. We can only do this
+      // if all of the operands of the instruction are loop invariant and if it
+      // is safe to hoist the instruction.
+      //
+      if (CurLoop->hasLoopInvariantOperands(&I) &&
+          canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) &&
+          isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
+                                 CurLoop->getLoopPreheader()->getTerminator()))
+        Changed |= hoist(I, CurLoop->getLoopPreheader());
+    }
+
+  // Depth first order: this block is handled before its dominator-tree
+  // children.
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (DomTreeNode *Child : Children)
+    Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+  return Changed;
+}
+
+/// Fill in \p SafetyInfo for \p CurLoop: record whether any instruction in
+/// the loop header may throw, whether any instruction anywhere in the loop
+/// may throw, and compute funclet colors when the enclosing function has a
+/// funclet-based EH personality.
+///
+void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop cant be null");
+  BasicBlock *Header = CurLoop->getHeader();
+
+  // Scan the header, stopping at the first instruction that may throw.
+  bool HeaderThrows = false;
+  for (Instruction &Inst : *Header)
+    if (Inst.mayThrow()) {
+      HeaderThrows = true;
+      break;
+    }
+  SafetyInfo->HeaderMayThrow = HeaderThrows;
+
+  // A throw in the header is also a throw in the loop. Otherwise look at
+  // every block of the loop, again stopping at the first hit.
+  bool LoopThrows = HeaderThrows;
+  for (Loop::block_iterator BI = CurLoop->block_begin(),
+                            BE = CurLoop->block_end();
+       BI != BE && !LoopThrows; ++BI)
+    for (Instruction &Inst : **BI)
+      if (Inst.mayThrow()) {
+        LoopThrows = true;
+        break;
+      }
+  SafetyInfo->MayThrow = LoopThrows;
+
+  // Compute funclet colors if we might sink/hoist in a function with a
+  // funclet personality routine.
+  Function *Fn = Header->getParent();
+  if (!Fn->hasPersonalityFn())
+    return;
+  Constant *PersonalityFn = Fn->getPersonalityFn();
+  if (PersonalityFn &&
+      isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+    SafetyInfo->BlockColors = colorEHFunclets(*Fn);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+/// Loads and calls get dedicated memory-dependence checks against \p CurAST;
+/// every other instruction must be one of a fixed set of side-effect-free
+/// kinds and must be safe to execute unconditionally.
+///
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
+                        TargetLibraryInfo *TLI, Loop *CurLoop,
+                        AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+  if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+    // Volatile and ordered atomic loads have to stay where they are.
+    if (!Load->isUnordered())
+      return false;
+
+    // Loads from constant memory, or tagged as invariant, are always safe to
+    // move, even if they share an alias set with something that is modified.
+    Value *Ptr = Load->getOperand(0);
+    if (AA->pointsToConstantMemory(Ptr) ||
+        Load->getMetadata(LLVMContext::MD_invariant_load))
+      return true;
+
+    // Otherwise the load is movable only if nothing in the loop may write
+    // the loaded location. An unsized type is queried with size zero.
+    uint64_t Size = 0;
+    if (Load->getType()->isSized())
+      Size = I.getModule()->getDataLayout().getTypeStoreSize(Load->getType());
+
+    AAMDNodes AAInfo;
+    Load->getAAMetadata(AAInfo);
+    return !pointerInvalidatedByLoop(Ptr, Size, AAInfo, CurAST);
+  }
+
+  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Moving debug intrinsics is legal but pointless.
+    if (isa<DbgInfoIntrinsic>(I))
+      return false;
+
+    // Calls that may throw are never moved.
+    if (CI->mayThrow())
+      return false;
+
+    // Ask alias analysis what the callee does to memory.
+    FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
+    if (Behavior == FMRB_DoesNotAccessMemory)
+      return true;
+
+    // FIXME: This should use mod/ref information to see if we can hoist or
+    // sink calls that may write memory.
+    if (!AliasAnalysis::onlyReadsMemory(Behavior))
+      return false;
+
+    // A readonly argmemonly function only reads memory reachable from its
+    // pointer arguments (at arbitrary offsets). It can move if the loop
+    // writes none of that memory.
+    if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) {
+      for (Value *Op : CI->arg_operands())
+        if (Op->getType()->isPointerTy() &&
+            pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize,
+                                     AAMDNodes(), CurAST))
+          return false;
+      return true;
+    }
+
+    // A general readonly call can move only if the loop contains no writes
+    // to memory at all.
+    for (AliasSet &AS : *CurAST)
+      if (!AS.isForwardingAliasSet() && AS.isMod())
+        return false;
+    return true;
+  }
+
+  // Only these instructions are hoistable/sinkable.
+  const bool IsMovableKind =
+      isa<BinaryOperator>(I) || isa<CastInst>(I) || isa<SelectInst>(I) ||
+      isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+      isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+      isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
+      isa<InsertValueInst>(I);
+  if (!IsMovableKind)
+    return false;
+
+  // TODO: Plumb the context instruction through to make hoisting and sinking
+  // more powerful. Hoisting of loads already works due to the special casing
+  // above.
+  return isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
+                                        nullptr);
+}
+
+/// Returns true if \p PN can simply be replaced by \p I, i.e. when every
+/// incoming value of the PHI is that instruction. A PHI without incoming
+/// values also qualifies.
+/// This pattern occurs most often with LCSSA PHI nodes.
+///
+static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+  for (unsigned Idx = 0, NumIncoming = PN.getNumIncomingValues();
+       Idx != NumIncoming; ++Idx)
+    if (PN.getIncomingValue(Idx) != &I)
+      return false;
+
+  return true;
+}
+
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
+///
+/// PHI users are treated specially: a use through a PHI counts as a use in
+/// the corresponding incoming block, and some EH-related PHI users make the
+/// function answer false regardless of where they live.
+///
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+                            const LICMSafetyInfo *SafetyInfo) {
+  const auto &BlockColors = SafetyInfo->BlockColors;
+  const bool IsCall = isa<CallInst>(I);
+
+  for (const User *U : I.users()) {
+    const Instruction *UserInst = cast<Instruction>(U);
+    const PHINode *Phi = dyn_cast<PHINode>(UserInst);
+
+    // Ordinary users: the use happens where the user lives.
+    if (!Phi) {
+      if (CurLoop->contains(UserInst))
+        return false;
+      continue;
+    }
+
+    const BasicBlock *PhiBB = Phi->getParent();
+    // We cannot sink uses in catchswitches.
+    if (isa<CatchSwitchInst>(PhiBB->getTerminator()))
+      return false;
+
+    // We need to sink a callsite to a unique funclet. Avoid sinking if the
+    // phi use is too muddled.
+    if (IsCall && !BlockColors.empty() &&
+        BlockColors.find(const_cast<BasicBlock *>(PhiBB))->second.size() != 1)
+      return false;
+
+    // A PHI node whose incoming values are all this instruction can just be
+    // RAUW'ed with the instruction, so it does not require a use in the
+    // predecessor; only the PHI's own location matters. This is the pattern
+    // found in LCSSA form.
+    if (isTriviallyReplacablePHI(*Phi, I)) {
+      if (CurLoop->contains(Phi))
+        return false;
+      continue;
+    }
+
+    // Otherwise, PHI node uses occur in the predecessor blocks of the
+    // incoming values. Check for such a use being inside the loop.
+    for (unsigned Idx = 0, NumIncoming = Phi->getNumIncomingValues();
+         Idx != NumIncoming; ++Idx)
+      if (Phi->getIncomingValue(Idx) == &I &&
+          CurLoop->contains(Phi->getIncomingBlock(Idx)))
+        return false;
+  }
+  return true;
+}
+
+/// Create a copy of \p I at the first insertion point of \p ExitBlock and
+/// return it. Call sites are re-created with a funclet operand bundle that
+/// matches the exit block's color; everything else is a plain clone. Operands
+/// defined in a loop that does not contain \p PN are routed through newly
+/// created ".lcssa" PHI nodes that reuse \p PN's incoming blocks.
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+                            const LoopInfo *LI,
+                            const LICMSafetyInfo *SafetyInfo) {
+  Instruction *Clone = nullptr;
+  CallInst *Call = dyn_cast<CallInst>(&I);
+  if (!Call) {
+    Clone = I.clone();
+  } else {
+    const auto &BlockColors = SafetyInfo->BlockColors;
+
+    // Sinking call-sites need to be handled differently from other
+    // instructions. The cloned call-site needs a funclet bundle operand
+    // appropriate for its location in the CFG, so carry over every bundle
+    // except an existing funclet one.
+    SmallVector<OperandBundleDef, 1> Bundles;
+    const unsigned NumBundles = Call->getNumOperandBundles();
+    for (unsigned Idx = 0; Idx != NumBundles; ++Idx) {
+      OperandBundleUse Bundle = Call->getOperandBundleAt(Idx);
+      if (Bundle.getTagID() != LLVMContext::OB_funclet)
+        Bundles.emplace_back(Bundle);
+    }
+
+    // Attach a funclet bundle for the exit block when its color is an EH pad.
+    if (!BlockColors.empty()) {
+      const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+      assert(CV.size() == 1 && "non-unique color for exit block!");
+      Instruction *EHPad = CV.front()->getFirstNonPHI();
+      if (EHPad->isEHPad())
+        Bundles.emplace_back("funclet", EHPad);
+    }
+
+    Clone = CallInst::Create(Call, Bundles);
+  }
+
+  ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), Clone);
+  if (!I.getName().empty())
+    Clone->setName(I.getName() + ".le");
+
+  // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+  // particularly cheap because we can rip off the PHI node that we're
+  // replacing for the number and blocks of the predecessors.
+  // OPT: If this shows up in a profile, we can instead finish sinking all
+  // invariant instructions, and then walk their operands to re-establish
+  // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+  // sinking bottom-up.
+  const unsigned NumPreds = PN.getNumIncomingValues();
+  for (User::op_iterator Op = Clone->op_begin(), OpEnd = Clone->op_end();
+       Op != OpEnd; ++Op) {
+    Instruction *Def = dyn_cast<Instruction>(*Op);
+    if (!Def)
+      continue;
+    Loop *DefLoop = LI->getLoopFor(Def->getParent());
+    if (!DefLoop || DefLoop->contains(&PN))
+      continue;
+
+    PHINode *LCSSAPhi =
+        PHINode::Create(Def->getType(), NumPreds,
+                        Def->getName() + ".lcssa", &ExitBlock.front());
+    for (unsigned Idx = 0; Idx != NumPreds; ++Idx)
+      LCSSAPhi->addIncoming(Def, PN.getIncomingBlock(Idx));
+    *Op = LCSSAPhi;
+  }
+  return Clone;
+}
+
+/// When an instruction is found to only be used outside of the loop, this
+/// function clones it into the exit blocks that use it (at most one clone per
+/// exit block), replaces the using PHI nodes with those clones, and then
+/// erases the original instruction from the loop and from \p CurAST.
+/// \returns true unconditionally, because the IR is always modified.
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
+ const Loop *CurLoop, AliasSetTracker *CurAST,
+ const LICMSafetyInfo *SafetyInfo) {
+ DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ bool Changed = false;
+ if (isa<LoadInst>(I)) ++NumMovedLoads;
+ else if (isa<CallInst>(I)) ++NumMovedCalls;
+ ++NumSunk;
+ Changed = true; // I is always erased at the end of this function.
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks; // Only consulted by the assert below.
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction, keyed by exit block. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction. Each iteration removes at least one use of I.
+ while (!I.use_empty()) {
+ Value::user_iterator UI = I.user_begin();
+ auto *User = cast<Instruction>(*UI);
+ if (!DT->isReachableFromEntry(User->getParent())) { // User is in unreachable code.
+ User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+ continue;
+ }
+ // The user must be a PHI node.
+ PHINode *PN = cast<PHINode>(User);
+
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable. Such a use is simply replaced with undef.
+ Use &U = UI.getUse();
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ continue;
+ }
+
+ BasicBlock *ExitBlock = PN->getParent();
+ assert(ExitBlockSet.count(ExitBlock) &&
+ "The LCSSA PHI is not in an exit block!");
+
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock); // Reuse the clone for this exit block if one exists.
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] =
+ CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+
+ PN->replaceAllUsesWith(New);
+ PN->eraseFromParent(); // Erasing the PHI also drops its use of I.
+ }
+
+ CurAST->deleteValue(&I); // Drop I from the alias set tracker before erasing it.
+ I.eraseFromParent();
+ return Changed;
+}
+
+/// Move \p I to the end of \p Preheader, right before its terminator. This
+/// is called once an instruction has been found to use only loop invariant
+/// operands and to be safe to hoist. Always returns true.
+///
+static bool hoist(Instruction &I, BasicBlock *Preheader) {
+  DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
+        << I << "\n");
+  // Move the new node to the Preheader, before its terminator.
+  Instruction *InsertBefore = Preheader->getTerminator();
+  I.moveBefore(InsertBefore);
+
+  // Metadata can be dependent on the condition we are hoisting above.
+  // Conservatively strip all metadata on the instruction.
+  I.dropUnknownNonDebugMetadata();
+
+  // Update statistics.
+  ++NumHoisted;
+  if (isa<LoadInst>(I))
+    ++NumMovedLoads;
+  else if (isa<CallInst>(I))
+    ++NumMovedCalls;
+  return true;
+}
+
+/// Return true if \p Inst may be executed unconditionally: either it is safe
+/// to execute speculatively (as judged at context instruction \p CtxI), or it
+/// is guaranteed to execute whenever the loop is entered.
+static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+                                           const DominatorTree *DT,
+                                           const TargetLibraryInfo *TLI,
+                                           const Loop *CurLoop,
+                                           const LICMSafetyInfo *SafetyInfo,
+                                           const Instruction *CtxI) {
+  // The speculation query is tried first; the guaranteed-execution check is
+  // only evaluated when it fails.
+  return isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI) ||
+         isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+}
+
+/// Return true if \p Inst is known to execute on every trip through
+/// \p CurLoop that reaches a loop exit, based on dominance of the exit blocks
+/// and the may-throw flags in \p SafetyInfo.
+static bool isGuaranteedToExecute(const Instruction &Inst,
+                                  const DominatorTree *DT,
+                                  const Loop *CurLoop,
+                                  const LICMSafetyInfo * SafetyInfo) {
+  const BasicBlock *InstBB = Inst.getParent();
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks. The only
+  // thing that can keep us from reaching it is a throw in the header itself.
+  if (InstBB == CurLoop->getHeader())
+    return !SafetyInfo->HeaderMayThrow;
+
+  // Somewhere in this loop there is an instruction which may throw and make us
+  // exit the loop.
+  if (SafetyInfo->MayThrow)
+    return false;
+
+  // Get the exit blocks for the current loop.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // As a degenerate case, if the loop is statically infinite then there are
+  // no exit blocks and nothing can be proven.
+  if (ExitBlocks.empty())
+    return false;
+
+  // The instruction's block has to dominate every exit block. If it doesn't,
+  // there is a path out of the loop which does not execute this instruction.
+  for (BasicBlock *ExitBlock : ExitBlocks)
+    if (!DT->dominates(InstBB, ExitBlock))
+      return false;
+
+  return true;
+}
+
+namespace {
+  /// LoadAndStorePromoter specialization used by promoteLoopAccessesToScalars.
+  /// On top of the generic load/store rewriting it inserts one store of the
+  /// live-out value at each recorded loop exit insertion point and keeps the
+  /// AliasSetTracker in sync with the rewrite.
+  class LoopPromoter : public LoadAndStorePromoter {
+    Value *SomePtr;  // Designated pointer to store to.
+    SmallPtrSetImpl<Value*> &PointerMustAliases;
+    SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
+    SmallVectorImpl<Instruction*> &LoopInsertPts;
+    PredIteratorCache &PredCache;
+    AliasSetTracker &AST;
+    LoopInfo &LI;
+    DebugLoc DL;
+    // Alignment given to the stores inserted in the exit blocks. Unsigned, to
+    // match the caller's variable and the setAlignment() argument.
+    unsigned Alignment;
+    AAMDNodes AATags;
+
+    /// If \p V is an instruction defined in a loop that does not contain
+    /// \p BB, create an ".lcssa" PHI for it at the start of \p BB (one
+    /// incoming entry per predecessor) and return that PHI. Otherwise return
+    /// \p V unchanged.
+    Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        if (Loop *L = LI.getLoopFor(I->getParent()))
+          if (!L->contains(BB)) {
+            // We need to create an LCSSA PHI node for the incoming value and
+            // store that.
+            PHINode *PN =
+              PHINode::Create(I->getType(), PredCache.size(BB),
+                              I->getName() + ".lcssa", &BB->front());
+            for (BasicBlock *Pred : PredCache.get(BB))
+              PN->addIncoming(I, Pred);
+            return PN;
+          }
+      return V;
+    }
+
+  public:
+    /// \p LEB and \p LIP are parallel: LIP[i] is the insertion point used for
+    /// the store placed in exit block LEB[i].
+    LoopPromoter(Value *SP,
+                 ArrayRef<const Instruction *> Insts,
+                 SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA,
+                 SmallVectorImpl<BasicBlock *> &LEB,
+                 SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
+                 AliasSetTracker &ast, LoopInfo &li, DebugLoc dl,
+                 unsigned alignment, const AAMDNodes &aaTags)
+      : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+        LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
+        LI(li), DL(dl), Alignment(alignment), AATags(aaTags) {}
+
+    /// An instruction belongs to the promoted set when the pointer it loads
+    /// from or stores to is one of the must-aliased pointers.
+    bool isInstInList(Instruction *I,
+                      const SmallVectorImpl<Instruction*> &) const override {
+      Value *Ptr;
+      if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        Ptr = LI->getOperand(0);
+      else
+        Ptr = cast<StoreInst>(I)->getPointerOperand();
+      return PointerMustAliases.count(Ptr);
+    }
+
+    void doExtraRewritesBeforeFinalDeletion() const override {
+      // Insert stores after in the loop exit blocks.  Each exit block gets a
+      // store of the live-out values that feed them.  Since we've already told
+      // the SSA updater about the defs in the loop and the preheader
+      // definition, it is all set and we can start using it.
+      for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+        BasicBlock *ExitBlock = LoopExitBlocks[i];
+        Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+        LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+        Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+        Instruction *InsertPos = LoopInsertPts[i];
+        StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+        NewSI->setAlignment(Alignment);
+        NewSI->setDebugLoc(DL);
+        if (AATags) NewSI->setAAMetadata(AATags);
+      }
+    }
+
+    void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+      // Update alias analysis.
+      AST.copyValue(LI, V);
+    }
+    void instructionDeleted(Instruction *I) const override {
+      AST.deleteValue(I);
+    }
+  };
+} // end anon namespace
+
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop. We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant. \p ExitBlocks and \p InsertPts are filled in here when
+/// \p ExitBlocks is empty. \returns true if the alias set \p AS was promoted.
+bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
+ SmallVectorImpl<BasicBlock*>&ExitBlocks,
+ SmallVectorImpl<Instruction*>&InsertPts,
+ PredIteratorCache &PIC, LoopInfo *LI,
+ DominatorTree *DT, Loop *CurLoop,
+ AliasSetTracker *CurAST,
+ LICMSafetyInfo * SafetyInfo) {
+ // Verify inputs.
+ assert(LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr &&
+ SafetyInfo != nullptr &&
+ "Unexpected Input to promoteLoopAccessesToScalars");
+ // Initially set Changed status to false.
+ bool Changed = false;
+ // We can promote this alias set if it has a store, if it is a "Must" alias
+ // set, if the pointer is loop invariant, and if we are not eliminating any
+ // volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
+ return Changed;
+
+ assert(!AS.empty() && // NOTE(review): AS.begin() was already dereferenced in the check above.
+ "Must alias set should have at least one pointer element in it!");
+
+ Value *SomePtr = AS.begin()->getValue(); // Designated pointer for the inserted load and stores.
+ BasicBlock * Preheader = CurLoop->getLoopPreheader(); // May be null; checked in the store case below.
+
+ // It isn't safe to promote a load/store from the loop if the load/store is
+ // conditional. For example, turning:
+ //
+ // for () { if (c) *P += 1; }
+ //
+ // into:
+ //
+ // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
+ //
+ // is not safe, because *P may only be valid to access if 'c' is true.
+ //
+ // It is safe to promote P if all uses are direct load/stores and if at
+ // least one is guaranteed to be executed.
+ bool GuaranteedToExecute = false;
+
+ SmallVector<Instruction*, 64> LoopUses; // In-loop loads and stores of the promoted location.
+ SmallPtrSet<Value*, 4> PointerMustAliases; // Every pointer value in the alias set.
+
+ // We start with an alignment of one and try to find instructions that allow
+ // us to prove better alignment.
+ unsigned Alignment = 1;
+ AAMDNodes AATags;
+ bool HasDedicatedExits = CurLoop->hasDedicatedExits();
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes. While we are at it, collect alignment and AA info.
+ for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
+ Value *ASIV = ASI->getValue();
+ PointerMustAliases.insert(ASIV);
+
+ // Give up as soon as one pointer's type differs from the designated
+ // pointer's type: a location that is loaded and stored in different
+ // sizes cannot (yet) be promoted.
+ if (SomePtr->getType() != ASIV->getType())
+ return Changed;
+
+ for (User *U : ASIV->users()) {
+ // Ignore instructions that are outside the loop.
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !CurLoop->contains(UI))
+ continue;
+
+ // If there is a non-load/store instruction in the loop, we can't promote
+ // it. Non-simple loads and stores also block promotion.
+ if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+ assert(!Load->isVolatile() && "AST broken");
+ if (!Load->isSimple())
+ return Changed;
+ } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
+ // Stores *of* the pointer are not interesting, only stores *to* the
+ // pointer.
+ if (UI->getOperand(1) != ASIV)
+ continue;
+ assert(!Store->isVolatile() && "AST broken");
+ if (!Store->isSimple())
+ return Changed;
+ // Don't sink stores from loops without dedicated block exits. Exits
+ // containing indirect branches are not transformed by loop simplify,
+ // make sure we catch that. An additional load may be generated in the
+ // preheader for SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!HasDedicatedExits || !Preheader)
+ return Changed;
+
+ // Note that we only check GuaranteedToExecute inside the store case
+ // so that we do not introduce stores where they did not exist before
+ // (which would break the LLVM concurrency model).
+
+ // If the alignment of this instruction allows us to specify a more
+ // restrictive (and performant) alignment and if we are sure this
+ // instruction will be executed, update the alignment.
+ // Larger is better, with the exception of 0 being the best alignment.
+ unsigned InstAlignment = Store->getAlignment();
+ if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
+ if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
+ GuaranteedToExecute = true;
+ Alignment = InstAlignment;
+ }
+
+ if (!GuaranteedToExecute) // Skip the query once one guaranteed store has been found.
+ GuaranteedToExecute = isGuaranteedToExecute(*UI, DT,
+ CurLoop, SafetyInfo);
+
+ } else
+ return Changed; // Not a load or store.
+
+ // Merge the AA tags.
+ if (LoopUses.empty()) {
+ // On the first load/store, just take its AA tags.
+ UI->getAAMetadata(AATags);
+ } else if (AATags) {
+ UI->getAAMetadata(AATags, /* Merge = */ true);
+ }
+
+ LoopUses.push_back(UI);
+ }
+ }
+
+ // If there isn't a guaranteed-to-execute instruction, we can't promote.
+ if (!GuaranteedToExecute)
+ return Changed;
+
+ // Figure out the loop exits and their insertion points, if this is the
+ // first promotion.
+ if (ExitBlocks.empty()) {
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ InsertPts.clear();
+ InsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ }
+
+ // Can't insert into a catchswitch.
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (isa<CatchSwitchInst>(ExitBlock->getTerminator()))
+ return Changed;
+
+ // Otherwise, this is safe to promote, lets do it!
+ DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
+ Changed = true;
+ ++NumPromoted;
+
+ // Grab a debug location for the inserted loads/stores; given that the
+ // inserted loads/stores have little relation to the original loads/stores,
+ // this code just arbitrarily picks a location from one, since any debug
+ // location is better than none.
+ DebugLoc DL = LoopUses[0]->getDebugLoc();
+
+ // We use the SSAUpdater interface to insert phi nodes as required.
+ SmallVector<PHINode*, 16> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ LoopPromoter Promoter(SomePtr, LoopUses, SSA,
+ PointerMustAliases, ExitBlocks,
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+
+ // Set up the preheader to have a definition of the value. It is the live-out
+ // value from the preheader that uses in the loop will use.
+ LoadInst *PreheaderLoad =
+ new LoadInst(SomePtr, SomePtr->getName()+".promoted",
+ Preheader->getTerminator());
+ PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setDebugLoc(DL);
+ if (AATags) PreheaderLoad->setAAMetadata(AATags);
+ SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+ // Rewrite all the loads in the loop and remember all the definitions from
+ // stores in the loop.
+ Promoter.run(LoopUses);
+
+ // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+ if (PreheaderLoad->use_empty())
+ PreheaderLoad->eraseFromParent();
+
+ return Changed;
+}
+
+/// Simple analysis hook. Copy the alias set information of \p From to \p To
+/// in the alias set tracker recorded for loop \p L. Does nothing when no
+/// tracker is recorded for \p L.
+///
+void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
+  if (AliasSetTracker *Tracker = LoopToAliasSetMap.lookup(L))
+    Tracker->copyValue(From, To);
+}
+
+/// Simple Analysis hook. Remove value \p V from the alias set tracker
+/// recorded for loop \p L. Does nothing when no tracker is recorded for \p L.
+///
+void LICM::deleteAnalysisValue(Value *V, Loop *L) {
+  if (AliasSetTracker *Tracker = LoopToAliasSetMap.lookup(L))
+    Tracker->deleteValue(V);
+}
+
+/// Simple Analysis hook. Destroy the alias set tracker recorded for loop
+/// \p L and drop its entry from the loop-to-tracker map. Does nothing when no
+/// tracker is recorded for \p L.
+///
+void LICM::deleteAnalysisLoop(Loop *L) {
+  if (AliasSetTracker *Tracker = LoopToAliasSetMap.lookup(L)) {
+    delete Tracker;
+    LoopToAliasSetMap.erase(L);
+  }
+}
+
+
+/// Return true if the alias set that \p CurAST associates with the location
+/// (\p V, \p Size, \p AAInfo) is marked as modified, i.e. the loop body may
+/// store into the memory pointed to by \p V.
+///
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                     const AAMDNodes &AAInfo,
+                                     AliasSetTracker *CurAST) {
+  const AliasSet &Set = CurAST->getAliasSetForPointer(V, Size, AAInfo);
+  return Set.isMod();
+}
+
+/// Return true if \p BB belongs to a subloop of \p CurLoop rather than to
+/// \p CurLoop itself, i.e. the loop LoopInfo reports for the block is not
+/// \p CurLoop. \p BB must be contained in \p CurLoop.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+  assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+  Loop *InnermostLoop = LI->getLoopFor(BB);
+  return InnermostLoop != CurLoop;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
new file mode 100644
index 0000000..1648878
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
@@ -0,0 +1,282 @@
+//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation combines adjacent loads.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-combine"
+
+STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining");
+STATISTIC(NumLoadsCombined, "Number of loads combined");
+
+namespace {
+/// A base pointer together with a constant byte offset from it.
+struct PointerOffsetPair {
+  Value *Pointer = nullptr;
+  uint64_t Offset = 0;
+};
+
+/// A candidate load, the base pointer/offset it reads from, and its position
+/// among the candidate loads of the block.
+struct LoadPOPPair {
+  LoadPOPPair() = default;
+  LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O)
+      : Load(L), POP(P), InsertOrder(O) {}
+  LoadInst *Load = nullptr;
+  PointerOffsetPair POP;
+  /// \brief The new load needs to be created before the first load in IR order.
+  unsigned InsertOrder = 0;
+};
+
+/// Basic block pass that combines adjacent simple integer loads from the same
+/// base pointer into a single wider load.
+class LoadCombine : public BasicBlockPass {
+  LLVMContext *C;
+  AliasAnalysis *AA;
+
+public:
+  LoadCombine()
+      : BasicBlockPass(ID), C(nullptr), AA(nullptr), Builder(nullptr) {
+    initializeLoadCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  using llvm::Pass::doInitialization;
+  bool doInitialization(Function &) override;
+  bool runOnBasicBlock(BasicBlock &BB) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  const char *getPassName() const override { return "LoadCombine"; }
+  static char ID;
+
+  typedef IRBuilder<true, TargetFolder> BuilderTy;
+
+private:
+  /// Builder used while a block is being processed; it is set by
+  /// runOnBasicBlock to a builder local to that call.
+  BuilderTy *Builder;
+
+  PointerOffsetPair getPointerOffsetPair(LoadInst &);
+  bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &);
+  bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &);
+  bool combineLoads(SmallVectorImpl<LoadPOPPair> &);
+};
+}
+
+/// Remember the LLVMContext of \p F for later use. Always returns true.
+bool LoadCombine::doInitialization(Function &F) {
+  C = &F.getContext();
+  DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n");
+  return true;
+}
+
+/// Strip bitcasts and constant-offset GEPs from the pointer operand of \p LI.
+/// Returns the pointer reached and the accumulated (zero-extended) constant
+/// offset. The walk stops early at a GEP with variable indices, in which case
+/// that GEP itself is the returned pointer.
+PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
+  PointerOffsetPair Result;
+  Result.Pointer = LI.getPointerOperand();
+  Result.Offset = 0;
+  for (;;) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Result.Pointer)) {
+      auto &DL = LI.getModule()->getDataLayout();
+      unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType());
+      APInt GEPOffset(BitWidth, 0);
+      // Can't handle GEPs with variable indices.
+      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+        return Result;
+      Result.Offset += GEPOffset.getZExtValue();
+      Result.Pointer = GEP->getPointerOperand();
+      continue;
+    }
+    if (auto *BC = dyn_cast<BitCastInst>(Result.Pointer)) {
+      Result.Pointer = BC->getOperand(0);
+      continue;
+    }
+    // Neither a GEP nor a bitcast: this is the base pointer.
+    break;
+  }
+  return Result;
+}
+
+/// For every base pointer in \p LoadMap that has at least two loads, sort its
+/// loads by offset and try to aggregate them. Returns true if any loads were
+/// combined.
+bool LoadCombine::combineLoads(
+    DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) {
+  bool Combined = false;
+  for (auto &Entry : LoadMap) {
+    auto &Group = Entry.second;
+    // A single load has nothing to be combined with.
+    if (Group.size() < 2)
+      continue;
+    std::sort(Group.begin(), Group.end(),
+              [](const LoadPOPPair &LHS, const LoadPOPPair &RHS) {
+                return LHS.POP.Offset < RHS.POP.Offset;
+              });
+    Combined |= aggregateLoads(Group);
+  }
+  return Combined;
+}
+
+/// \brief Try to aggregate loads from a sorted list of loads to be combined.
+///
+/// It is guaranteed that no writes occur between any of the loads. All loads
+/// have the same base pointer. There are at least two loads.
+///
+/// The list is split into runs of back-to-back loads (each load starting
+/// exactly where the previous one ends); every run is handed to combineLoads.
+/// \returns true if any run was combined.
+bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
+  assert(Loads.size() >= 2 && "Insufficient loads!");
+  LoadInst *BaseLoad = nullptr;
+  SmallVector<LoadPOPPair, 8> AggregateLoads;
+  bool Combined = false;
+  // Whether a run is currently open. This is tracked explicitly instead of
+  // through a magic PrevOffset value: UINT64_MAX is a reachable offset (a
+  // 64-bit GEP offset of -1, zero-extended) and must not be mistaken for
+  // "no run open", which would make two loads at that offset look adjacent.
+  bool HaveRun = false;
+  uint64_t PrevOffset = 0;
+  uint64_t PrevSize = 0;
+  for (auto &L : Loads) {
+    if (!HaveRun) {
+      // This load starts a new run and becomes its base.
+      BaseLoad = L.Load;
+      PrevOffset = L.POP.Offset;
+      PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
+          L.Load->getType());
+      AggregateLoads.push_back(L);
+      HaveRun = true;
+      continue;
+    }
+    // Skip loads that are more strictly aligned than the base of the run.
+    if (L.Load->getAlignment() > BaseLoad->getAlignment())
+      continue;
+    if (L.POP.Offset > PrevOffset + PrevSize) {
+      // No other load will be combinable
+      if (combineLoads(AggregateLoads))
+        Combined = true;
+      AggregateLoads.clear();
+      HaveRun = false;
+      continue;
+    }
+    if (L.POP.Offset != PrevOffset + PrevSize)
+      // This load is offset less than the size of the last load.
+      // FIXME: We may want to handle this case.
+      continue;
+    PrevOffset = L.POP.Offset;
+    PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
+        L.Load->getType());
+    AggregateLoads.push_back(L);
+  }
+  // Flush the last run; combineLoads rejects runs of fewer than two loads.
+  if (combineLoads(AggregateLoads))
+    Combined = true;
+  return Combined;
+}
+
+/// \brief Given a list of combinable loads, combine the maximum number of them.
+///
+/// Loads are dropped from the end of \p Loads until the total width is a
+/// power of two. If at least two loads remain, one wide integer load is
+/// emitted before the earliest of them and every original load's uses are
+/// redirected to a value extracted from it. Returns true if that happened.
+bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
+  // Remove loads from the end while the size is not a power of 2.
+  unsigned TotalBits = 0;
+  for (const auto &L : Loads)
+    TotalBits += L.Load->getType()->getPrimitiveSizeInBits();
+  while (TotalBits != 0 && !isPowerOf2_32(TotalBits)) {
+    LoadPOPPair Dropped = Loads.pop_back_val();
+    TotalBits -= Dropped.Load->getType()->getPrimitiveSizeInBits();
+  }
+  if (Loads.size() < 2)
+    return false;
+
+  DEBUG({
+    dbgs() << "***** Combining Loads ******\n";
+    for (const auto &L : Loads) {
+      dbgs() << L.POP.Offset << ": " << *L.Load << "\n";
+    }
+  });
+
+  // Find first load. This is where we put the new load.
+  LoadPOPPair Earliest;
+  Earliest.InsertOrder = -1u;
+  for (const auto &L : Loads)
+    if (L.InsertOrder < Earliest.InsertOrder)
+      Earliest = L;
+
+  unsigned AddressSpace =
+      Earliest.POP.Pointer->getType()->getPointerAddressSpace();
+
+  // Address of the first load in the list, computed as an i8* plus offset.
+  Builder->SetInsertPoint(Earliest.Load);
+  Value *BytePtr = Builder->CreatePointerCast(
+      Loads[0].POP.Pointer, Builder->getInt8PtrTy(AddressSpace));
+  Value *Ptr = Builder->CreateConstGEP1_64(BytePtr, Loads[0].POP.Offset);
+
+  // Reinterpret that address as a pointer to one integer covering all loads
+  // and load it with the alignment of the first load in the list.
+  Value *WidePtr = Builder->CreatePointerCast(
+      Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalBits),
+                            Ptr->getType()->getPointerAddressSpace()));
+  LoadInst *NewLoad = new LoadInst(
+      WidePtr, Twine(Loads[0].Load->getName()) + ".combined", false,
+      Loads[0].Load->getAlignment(), Earliest.Load);
+
+  // Replace each original load with the matching slice of the wide value.
+  for (const auto &L : Loads) {
+    Builder->SetInsertPoint(L.Load);
+    Value *Slice = Builder->CreateExtractInteger(
+        L.Load->getModule()->getDataLayout(), NewLoad,
+        cast<IntegerType>(L.Load->getType()),
+        L.POP.Offset - Loads[0].POP.Offset, "combine.extract");
+    L.Load->replaceAllUsesWith(Slice);
+  }
+
+  NumLoadsCombined = NumLoadsCombined + Loads.size();
+  return true;
+}
+
+/// Collect the simple integer loads of \p BB, grouped by base pointer, and
+/// combine adjacent ones. The collected loads are flushed (combined and then
+/// forgotten) at every instruction that may throw or that may write memory
+/// the tracked loads depend on. Returns true if any loads were combined.
+bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
+  if (skipOptnoneFunction(BB))
+    return false;
+
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  BuilderTy TheBuilder(BB.getContext(),
+                       TargetFolder(BB.getModule()->getDataLayout()));
+  Builder = &TheBuilder;
+
+  DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
+  AliasSetTracker AST(*AA);
+
+  bool Combined = false;
+  unsigned Index = 0;
+  for (auto &I : BB) {
+    if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
+      // Loads must not be combined across this instruction: process what has
+      // been collected so far and start over.
+      if (combineLoads(LoadMap))
+        Combined = true;
+      LoadMap.clear();
+      AST.clear();
+      continue;
+    }
+    LoadInst *LI = dyn_cast<LoadInst>(&I);
+    if (!LI)
+      continue;
+    ++NumLoadsAnalyzed;
+    // Only simple loads of integer type are candidates.
+    if (!LI->isSimple() || !LI->getType()->isIntegerTy())
+      continue;
+    auto POP = getPointerOffsetPair(*LI);
+    if (!POP.Pointer)
+      continue;
+    LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++));
+    AST.add(LI);
+  }
+  if (combineLoads(LoadMap))
+    Combined = true;
+
+  // TheBuilder dies with this call; don't leave the member pointing at it.
+  Builder = nullptr;
+  return Combined;
+}
+
+/// Declare the analyses this pass needs and keeps intact: alias analysis
+/// results are required; the CFG and GlobalsAA are preserved.
+void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {
+  // Required.
+  AU.addRequired<AAResultsWrapperPass>();
+
+  // Preserved.
+  AU.setPreservesCFG();
+  AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+// Storage for the pass identifier declared in the class.
+char LoadCombine::ID = 0;
+
+/// Create a new instance of the load combining pass. The caller receives the
+/// newly allocated pass object.
+BasicBlockPass *llvm::createLoadCombinePass() {
+  return new LoadCombine;
+}
+
+INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads",
+ false, false) // Opens the registration of LoadCombine under the name "load-combine".
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) // Required in getAnalysisUsage.
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) // Declared preserved in getAnalysisUsage.
+INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads",
+ false, false) // Must repeat the arguments given to INITIALIZE_PASS_BEGIN above.
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 0000000..7b1940b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,254 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Dominators.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-delete"
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+namespace {
+ class LoopDeletion : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletion() : LoopPass(ID) {
+ initializeLoopDeletionPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreservedID(LCSSAID);
+ }
+
+ private:
+ bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks,
+ SmallVectorImpl<BasicBlock *> &exitBlocks,
+ bool &Changed, BasicBlock *Preheader);
+
+ };
+}
+
+char LoopDeletion::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
+ "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopDeletion, "loop-deletion",
+ "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() {
+ return new LoopDeletion();
+}
+
+/// isLoopDead - Determine if a loop is dead. This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::isLoopDead(Loop *L,
+ SmallVectorImpl<BasicBlock *> &exitingBlocks,
+ SmallVectorImpl<BasicBlock *> &exitBlocks,
+ bool &Changed, BasicBlock *Preheader) {
+ BasicBlock *exitBlock = exitBlocks[0];
+
+ // Make sure that all PHI entries coming from the loop are loop invariant.
+ // Because the code is in LCSSA form, any values used outside of the loop
+ // must pass through a PHI in the exit block, meaning that this check is
+ // sufficient to guarantee that no loop-variant values are used outside
+ // of the loop.
+ BasicBlock::iterator BI = exitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+
+ // Make sure all exiting blocks produce the same incoming value for the exit
+ // block. If there are different incoming values for different exiting
+ // blocks, then it is impossible to statically determine which value should
+ // be used.
+ for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) {
+ if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
+ return false;
+ }
+
+ if (Instruction *I = dyn_cast<Instruction>(incoming))
+ if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
+ return false;
+
+ ++BI;
+ }
+
+ // Make sure that no instructions in the loop have potential side-effects.
+ // This includes instructions that could write to memory, and loads that are
+ // marked volatile. This could be made more aggressive by using aliasing
+ // information to identify readonly and readnone calls.
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI) {
+ for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+ BI != BE; ++BI) {
+ if (BI->mayHaveSideEffects())
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
+/// observable behavior of the program other than finite running time. Note
+/// we do ensure that this never removes a loop that might be infinite, as doing
+/// so could change the halting/non-halting nature of a program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
+/// in order to make various safety checks work.
+bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipOptnoneFunction(L))
+ return false;
+
+ // We can only remove the loop if there is a preheader that we can
+ // branch from after removing it.
+ BasicBlock *preheader = L->getLoopPreheader();
+ if (!preheader)
+ return false;
+
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->hasDedicatedExits())
+ return false;
+
+ // We can't remove loops that contain subloops. If the subloops were dead,
+ // they would already have been removed in earlier executions of this pass.
+ if (L->begin() != L->end())
+ return false;
+
+ SmallVector<BasicBlock*, 4> exitingBlocks;
+ L->getExitingBlocks(exitingBlocks);
+
+ SmallVector<BasicBlock*, 4> exitBlocks;
+ L->getUniqueExitBlocks(exitBlocks);
+
+ // We require that the loop only have a single exit block. Otherwise, we'd
+ // be in the situation of needing to be able to solve statically which exit
+ // block will be branched to, or trying to preserve the branching logic in
+ // a loop invariant manner.
+ if (exitBlocks.size() != 1)
+ return false;
+
+ // Finally, we have to check that the loop really is dead.
+ bool Changed = false;
+ if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
+ return Changed;
+
+ // Don't remove loops for which we can't solve the trip count.
+ // They could be infinite, in which case we'd be changing program behavior.
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const SCEV *S = SE.getMaxBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(S))
+ return Changed;
+
+ // Now that we know the removal is safe, remove the loop by changing the
+ // branch from the preheader to go to the single exit block.
+ BasicBlock *exitBlock = exitBlocks[0];
+
+ // Because we're deleting a large chunk of code at once, the sequence in which
+ // we remove things is very important to avoid invalidation issues. Don't
+ // mess with this unless you have good reason and know what you're doing.
+
+ // Tell ScalarEvolution that the loop is deleted. Do this before
+ // deleting the loop so that ScalarEvolution can look at the loop
+ // to determine what it needs to clean up.
+ SE.forgetLoop(L);
+
+ // Connect the preheader directly to the exit block.
+ TerminatorInst *TI = preheader->getTerminator();
+ TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+
+ // Rewrite phis in the exit block to get their inputs from
+ // the preheader instead of the exiting block.
+ BasicBlock *exitingBlock = exitingBlocks[0];
+ BasicBlock::iterator BI = exitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ int j = P->getBasicBlockIndex(exitingBlock);
+ assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
+ P->setIncomingBlock(j, preheader);
+ for (unsigned i = 1; i < exitingBlocks.size(); ++i)
+ P->removeIncomingValue(exitingBlocks[i]);
+ ++BI;
+ }
+
+ // Update the dominator tree and remove the instructions and blocks that will
+ // be deleted from the reference counting scheme.
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SmallVector<DomTreeNode*, 8> ChildNodes;
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI) {
+ // Move all of the block's children to be children of the preheader, which
+ // allows us to remove the domtree entry for the block.
+ ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
+ for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(),
+ DE = ChildNodes.end(); DI != DE; ++DI) {
+ DT.changeImmediateDominator(*DI, DT[preheader]);
+ }
+
+ ChildNodes.clear();
+ DT.eraseNode(*LI);
+
+ // Remove the block from the reference counting scheme, so that we can
+ // delete it freely later.
+ (*LI)->dropAllReferences();
+ }
+
+ // Erase the instructions and the blocks without having to worry
+ // about ordering because we already dropped the references.
+ // NOTE: This iteration is safe because erasing the block does not remove its
+ // entry from the loop's block list. We do that in the next section.
+ for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+ LI != LE; ++LI)
+ (*LI)->eraseFromParent();
+
+ // Finally, remove the blocks from loopinfo. This has to happen late because
+ // otherwise our loop iterators won't work.
+ LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ SmallPtrSet<BasicBlock*, 8> blocks;
+ blocks.insert(L->block_begin(), L->block_end());
+ for (BasicBlock *BB : blocks)
+ loopInfo.removeBlock(BB);
+
+ // The last step is to update LoopInfo now that we've eliminated this loop.
+ loopInfo.markAsRemoved(L);
+ Changed = true;
+
+ ++NumDeleted;
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
new file mode 100644
index 0000000..3d3cf3e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -0,0 +1,836 @@
+//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Distribution Pass. Its main focus is to
+// distribute loops that cannot be vectorized due to dependence cycles. It
+// tries to isolate the offending dependences into a new loop allowing
+// vectorization of the remaining parts.
+//
+// For dependence analysis, the pass uses the LoopVectorizer's
+// LoopAccessAnalysis. Because this analysis presumes no change in the order of
+// memory operations, special care is taken to preserve the lexical order of
+// these operations.
+//
+// Similarly to the Vectorizer, the pass also supports loop versioning to
+// run-time disambiguate potentially overlapping arrays.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <list>
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+using namespace llvm;
+
+static cl::opt<bool>
+ LDistVerify("loop-distribute-verify", cl::Hidden,
+ cl::desc("Turn on DominatorTree and LoopInfo verification "
+ "after Loop Distribution"),
+ cl::init(false));
+
+static cl::opt<bool> DistributeNonIfConvertible(
+ "loop-distribute-non-if-convertible", cl::Hidden,
+ cl::desc("Whether to distribute into a loop that may not be "
+ "if-convertible by the loop vectorizer"),
+ cl::init(false));
+
+static cl::opt<unsigned> DistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Distribution"));
+
+STATISTIC(NumLoopsDistributed, "Number of loops distributed");
+
+namespace {
+/// \brief Maintains the set of instructions of the loop for a partition before
+/// cloning. After cloning, it hosts the new loop.
+class InstPartition {
+ typedef SmallPtrSet<Instruction *, 8> InstructionSet;
+
+public:
+ InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+ : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
+ Set.insert(I);
+ }
+
+ /// \brief Returns whether this partition contains a dependence cycle.
+ bool hasDepCycle() const { return DepCycle; }
+
+ /// \brief Adds an instruction to this partition.
+ void add(Instruction *I) { Set.insert(I); }
+
+ /// \brief Collection accessors.
+ InstructionSet::iterator begin() { return Set.begin(); }
+ InstructionSet::iterator end() { return Set.end(); }
+ InstructionSet::const_iterator begin() const { return Set.begin(); }
+ InstructionSet::const_iterator end() const { return Set.end(); }
+ bool empty() const { return Set.empty(); }
+
+ /// \brief Moves this partition into \p Other. This partition becomes empty
+ /// after this.
+ void moveTo(InstPartition &Other) {
+ Other.Set.insert(Set.begin(), Set.end());
+ Set.clear();
+ Other.DepCycle |= DepCycle;
+ }
+
+ /// \brief Populates the partition with a transitive closure of all the
+ /// instructions that the seeded instructions depend on.
+ void populateUsedSet() {
+ // FIXME: We currently don't use control-dependence but simply include all
+ // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+ // up.
+ for (auto *B : OrigLoop->getBlocks())
+ Set.insert(B->getTerminator());
+
+ // Follow the use-def chains to form a transitive closure of all the
+ // instructions that the originally seeded instructions depend on.
+ SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ // Insert instructions from the loop that we depend on.
+ for (Value *V : I->operand_values()) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+ Worklist.push_back(I);
+ }
+ }
+ }
+
+ /// \brief Clones the original loop.
+ ///
+ /// Updates LoopInfo and DominatorTree using the information that block \p
+ /// LoopDomBB dominates the loop.
+ Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+ unsigned Index, LoopInfo *LI,
+ DominatorTree *DT) {
+ ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+ VMap, Twine(".ldist") + Twine(Index),
+ LI, DT, ClonedLoopBlocks);
+ return ClonedLoop;
+ }
+
+ /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ const Loop *getClonedLoop() const { return ClonedLoop; }
+
+ /// \brief Returns the loop where this partition ends up after distribution.
+ /// If this partition is mapped to the original loop then this is the original
+ /// loop itself.
+ const Loop *getDistributedLoop() const {
+ return ClonedLoop ? ClonedLoop : OrigLoop;
+ }
+
+ /// \brief The VMap that is populated by cloning and then used in
+ /// remapInstructions() to remap the cloned instructions.
+ ValueToValueMapTy &getVMap() { return VMap; }
+
+ /// \brief Remaps the cloned instructions using VMap.
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
+
+ /// \brief Based on the set of instructions selected for this partition,
+ /// removes the unnecessary ones.
+ void removeUnusedInsts() {
+ SmallVector<Instruction *, 8> Unused;
+
+ for (auto *Block : OrigLoop->getBlocks())
+ for (auto &Inst : *Block)
+ if (!Set.count(&Inst)) {
+ Instruction *NewInst = &Inst;
+ if (!VMap.empty())
+ NewInst = cast<Instruction>(VMap[NewInst]);
+
+ assert(!isa<BranchInst>(NewInst) &&
+ "Branches are marked used early on");
+ Unused.push_back(NewInst);
+ }
+
+ // Delete the instructions backwards, as it has a reduced likelihood of
+ // having to update as many def-use and use-def chains.
+ for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ Inst->eraseFromParent();
+ }
+ }
+
+ void print() const {
+ if (DepCycle)
+ dbgs() << " (cycle)\n";
+ for (auto *I : Set)
+ // Prefix with the block name.
+ dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
+ }
+
+ void printBlocks() const {
+ for (auto *BB : getDistributedLoop()->getBlocks())
+ dbgs() << *BB;
+ }
+
+private:
+ /// \brief Instructions from OrigLoop selected for this partition.
+ InstructionSet Set;
+
+ /// \brief Whether this partition contains a dependence cycle.
+ bool DepCycle;
+
+ /// \brief The original loop.
+ Loop *OrigLoop;
+
+ /// \brief The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ Loop *ClonedLoop;
+
+ /// \brief The blocks of ClonedLoop including the preheader. If this
+ /// partition is mapped to the original loop, this is empty.
+ SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+ /// \brief This gets populated once the set of instructions has been
+ /// finalized. If this partition is mapped to the original loop, this is not
+ /// set.
+ ValueToValueMapTy VMap;
+};
+
+/// \brief Holds the set of Partitions. It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+ typedef DenseMap<Instruction *, int> InstToPartitionIdT;
+
+public:
+ InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+ : L(L), LI(LI), DT(DT) {}
+
+ /// \brief Returns the number of partitions.
+ unsigned getSize() const { return PartitionContainer.size(); }
+
+ /// \brief Adds \p Inst into the current partition if that is marked to
+ /// contain cycles. Otherwise start a new partition for it.
+ void addToCyclicPartition(Instruction *Inst) {
+ // If the current partition is non-cyclic, start a new one.
+ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+ PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+ else
+ PartitionContainer.back().add(Inst);
+ }
+
+ /// \brief Adds \p Inst into a partition that is not marked to contain
+ /// dependence cycles.
+ ///
+ /// Initially we isolate memory instructions into as many partitions as
+ /// possible, then later we may merge them back together.
+ void addToNewNonCyclicPartition(Instruction *Inst) {
+ PartitionContainer.emplace_back(Inst, L);
+ }
+
+ /// \brief Merges adjacent non-cyclic partitions.
+ ///
+ /// The idea is that we currently only want to isolate the non-vectorizable
+ /// partition. We could later allow more distribution among these partitions
+ /// too.
+ void mergeAdjacentNonCyclic() {
+ mergeAdjacentPartitionsIf(
+ [](const InstPartition *P) { return !P->hasDepCycle(); });
+ }
+
+ /// \brief If a partition contains only conditional stores, we won't vectorize
+ /// it. Try to merge it with a previous cyclic partition.
+ void mergeNonIfConvertible() {
+ mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
+ if (Partition->hasDepCycle())
+ return true;
+
+ // Now, check if all stores are conditional in this partition.
+ bool seenStore = false;
+
+ for (auto *Inst : *Partition)
+ if (isa<StoreInst>(Inst)) {
+ seenStore = true;
+ if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
+ return false;
+ }
+ return seenStore;
+ });
+ }
+
+ /// \brief Merges the partitions according to various heuristics.
+ void mergeBeforePopulating() {
+ mergeAdjacentNonCyclic();
+ if (!DistributeNonIfConvertible)
+ mergeNonIfConvertible();
+ }
+
+ /// \brief Merges partitions in order to ensure that no loads are duplicated.
+ ///
+ /// We can't duplicate loads because that could potentially reorder them.
+ /// LoopAccessAnalysis provides dependency information with the context that
+ /// the order of memory operation is preserved.
+ ///
+ /// Returns true if any partitions were merged.
+ bool mergeToAvoidDuplicatedLoads() {
+ typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
+ typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
+
+ LoadToPartitionT LoadToPartition;
+ ToBeMergedT ToBeMerged;
+
+ // Step through the partitions and create equivalence between partitions
+ // that contain the same load. Also put partitions in between them in the
+ // same equivalence class to avoid reordering of memory operations.
+ for (PartitionContainerT::iterator I = PartitionContainer.begin(),
+ E = PartitionContainer.end();
+ I != E; ++I) {
+ auto *PartI = &*I;
+
+ // If a load occurs in two partitions PartI and PartJ, merge all
+ // partitions (PartI, PartJ] into PartI.
+ for (Instruction *Inst : *PartI)
+ if (isa<LoadInst>(Inst)) {
+ bool NewElt;
+ LoadToPartitionT::iterator LoadToPart;
+
+ std::tie(LoadToPart, NewElt) =
+ LoadToPartition.insert(std::make_pair(Inst, PartI));
+ if (!NewElt) {
+ DEBUG(dbgs() << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", "
+ << LoadToPart->second << "\n" << *Inst << "\n");
+
+ auto PartJ = I;
+ do {
+ --PartJ;
+ ToBeMerged.unionSets(PartI, &*PartJ);
+ } while (&*PartJ != LoadToPart->second);
+ }
+ }
+ }
+ if (ToBeMerged.empty())
+ return false;
+
+ // Merge the members of an equivalence class into its class leader. This
+ // makes the members empty.
+ for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+
+ auto PartI = I->getData();
+ for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
+ ToBeMerged.member_end())) {
+ PartJ->moveTo(*PartI);
+ }
+ }
+
+ // Remove the empty partitions.
+ PartitionContainer.remove_if(
+ [](const InstPartition &P) { return P.empty(); });
+
+ return true;
+ }
+
+ /// \brief Sets up the mapping between instructions to partitions. If the
+ /// instruction is duplicated across multiple partitions, set the entry to -1.
+ void setupPartitionIdOnInstructions() {
+ int PartitionID = 0;
+ for (const auto &Partition : PartitionContainer) {
+ for (Instruction *Inst : Partition) {
+ bool NewElt;
+ InstToPartitionIdT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
+ if (!NewElt)
+ Iter->second = -1;
+ }
+ ++PartitionID;
+ }
+ }
+
+ /// \brief Populates the partition with everything that the seeding
+ /// instructions require.
+ void populateUsedSet() {
+ for (auto &P : PartitionContainer)
+ P.populateUsedSet();
+ }
+
+ /// \brief This performs the main chunk of the work of cloning the loops for
+ /// the partitions.
+ void cloneLoops() {
+ BasicBlock *OrigPH = L->getLoopPreheader();
+ // At this point the predecessor of the preheader is either the memcheck
+ // block or the top part of the original preheader.
+ BasicBlock *Pred = OrigPH->getSinglePredecessor();
+ assert(Pred && "Preheader does not have a single predecessor");
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock && "No single exit block");
+ Loop *NewLoop;
+
+ assert(!PartitionContainer.empty() && "at least two partitions expected");
+ // We're cloning the preheader along with the loop so we already made sure
+ // it was empty.
+ assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
+ "preheader not empty");
+
+ // Create a loop for each partition except the last. Clone the original
+ // loop before PH along with adding a preheader for the cloned loop. Then
+ // update PH to point to the newly added preheader.
+ BasicBlock *TopPH = OrigPH;
+ unsigned Index = getSize() - 1;
+ for (auto I = std::next(PartitionContainer.rbegin()),
+ E = PartitionContainer.rend();
+ I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
+ auto *Part = &*I;
+
+ NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
+
+ Part->getVMap()[ExitBlock] = TopPH;
+ Part->remapInstructions();
+ }
+ Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
+
+ // Now go in forward order and update the immediate dominator for the
+ // preheaders with the exiting block of the previous loop. Dominance
+ // within the loop is updated in cloneLoopWithPreheader.
+ for (auto Curr = PartitionContainer.cbegin(),
+ Next = std::next(PartitionContainer.cbegin()),
+ E = PartitionContainer.cend();
+ Next != E; ++Curr, ++Next)
+ DT->changeImmediateDominator(
+ Next->getDistributedLoop()->getLoopPreheader(),
+ Curr->getDistributedLoop()->getExitingBlock());
+ }
+
+ /// \brief Removes the dead instructions from the cloned loops.
+ void removeUnusedInsts() {
+ for (auto &Partition : PartitionContainer)
+ Partition.removeUnusedInsts();
+ }
+
+ /// \brief For each memory pointer, it computes the partitionId the pointer is
+ /// used in.
+ ///
+ /// This returns an array of int where the I-th entry corresponds to the I-th
+ /// pointer in LAI.getRuntimePointerChecking()->Pointers. If the pointer is
+ /// used in multiple partitions its entry is set to -1.
+ SmallVector<int, 8>
+ computePartitionSetForPointers(const LoopAccessInfo &LAI) {
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
+
+ unsigned N = RtPtrCheck->Pointers.size();
+ SmallVector<int, 8> PtrToPartitions(N);
+ for (unsigned I = 0; I < N; ++I) {
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
+ auto Instructions =
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+
+ int &Partition = PtrToPartitions[I];
+ // First set it to uninitialized.
+ Partition = -2;
+ for (Instruction *Inst : Instructions) {
+ // Note that this could be -1 if Inst is duplicated across multiple
+ // partitions.
+ int ThisPartition = this->InstToPartitionId[Inst];
+ if (Partition == -2)
+ Partition = ThisPartition;
+ // -1 means belonging to multiple partitions.
+ else if (Partition == -1)
+ break;
+ else if (Partition != (int)ThisPartition)
+ Partition = -1;
+ }
+ assert(Partition != -2 && "Pointer not belonging to any partition");
+ }
+
+ return PtrToPartitions;
+ }
+
+ void print(raw_ostream &OS) const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ OS << "Partition " << Index++ << " (" << &P << "):\n";
+ P.print();
+ }
+ }
+
+ void dump() const { print(dbgs()); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const InstPartitionContainer &Partitions) {
+ Partitions.print(OS);
+ return OS;
+ }
+#endif
+
+ void printBlocks() const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
+ P.printBlocks();
+ }
+ }
+
+private:
+ typedef std::list<InstPartition> PartitionContainerT;
+
+ /// \brief List of partitions.
+ PartitionContainerT PartitionContainer;
+
+ /// \brief Mapping from Instruction to partition Id. If the instruction
+ /// belongs to multiple partitions the entry contains -1.
+ InstToPartitionIdT InstToPartitionId;
+
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+
+ /// \brief The control structure to merge adjacent partitions if both satisfy
+ /// the \p Predicate.
+ template <class UnaryPredicate>
+ void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
+ InstPartition *PrevMatch = nullptr;
+ for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
+ auto DoesMatch = Predicate(&*I);
+ if (PrevMatch == nullptr && DoesMatch) {
+ PrevMatch = &*I;
+ ++I;
+ } else if (PrevMatch != nullptr && DoesMatch) {
+ I->moveTo(*PrevMatch);
+ I = PartitionContainer.erase(I);
+ } else {
+ PrevMatch = nullptr;
+ ++I;
+ }
+ }
+ }
+};
+
+/// \brief For each memory instruction, this class maintains the difference of the
+/// number of unsafe dependences that start out from this instruction minus
+/// those that end here.
+///
+/// By traversing the memory instructions in program order and accumulating this
+/// number, we know whether any unsafe dependence crosses over a program point.
+class MemoryInstructionDependences {
+ typedef MemoryDepChecker::Dependence Dependence;
+
+public:
+ struct Entry {
+ Instruction *Inst;
+ unsigned NumUnsafeDependencesStartOrEnd;
+
+ Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
+ };
+
+ typedef SmallVector<Entry, 8> AccessesType;
+
+ AccessesType::const_iterator begin() const { return Accesses.begin(); }
+ AccessesType::const_iterator end() const { return Accesses.end(); }
+
+ MemoryInstructionDependences(
+ const SmallVectorImpl<Instruction *> &Instructions,
+ const SmallVectorImpl<Dependence> &Dependences) {
+ Accesses.append(Instructions.begin(), Instructions.end());
+
+ DEBUG(dbgs() << "Backward dependences:\n");
+ for (auto &Dep : Dependences)
+ if (Dep.isPossiblyBackward()) {
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+ --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+ DEBUG(Dep.print(dbgs(), 2, Instructions));
+ }
+ }
+
+private:
+ AccessesType Accesses;
+};
+
+/// \brief The pass class.
+class LoopDistribute : public FunctionPass {
+public:
+ LoopDistribute() : FunctionPass(ID) {
+ initializeLoopDistributePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LAA = &getAnalysis<LoopAccessAnalysis>();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ // Build up a worklist of inner-loops to distribute. This is necessary as the
+ // act of distributing a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist)
+ Changed |= processLoop(L);
+
+ // Report whether any loop in the function was changed.
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ static char ID;
+
+private:
+ /// \brief Keep only the run-time checks that guard pointers living in
+ /// different partitions.
+ ///
+ /// \p PtrToPartition holds the partition number of each pointer. A partition
+ /// number of -1 marks a pointer that is used in several partitions, in which
+ /// case the check cannot safely be dropped.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4>
+ includeOnlyCrossPartitionChecks(
+     const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+     const SmallVectorImpl<int> &PtrToPartition,
+     const RuntimePointerChecking *RtPtrChecking) {
+   // A check between two pointer groups is kept only when a single pair of
+   // pointers (one taken from each group) both requires checking and falls
+   // into separate partitions.
+   //
+   // Knowing that the two groups need a check does not imply that every pair
+   // of pointers across them does, so both conditions are tested on the same
+   // pair: one pair that needs checking plus a different pair that straddles
+   // partitions is not enough.
+   auto GuardsCrossPartitionPair =
+       [&](const RuntimePointerChecking::PointerCheck &Candidate) {
+         for (unsigned FirstIdx : Candidate.first->Members)
+           for (unsigned SecondIdx : Candidate.second->Members) {
+             if (!RtPtrChecking->needsChecking(FirstIdx, SecondIdx))
+               continue;
+             if (!RuntimePointerChecking::arePointersInSamePartition(
+                     PtrToPartition, FirstIdx, SecondIdx))
+               return true;
+           }
+         return false;
+       };
+
+   SmallVector<RuntimePointerChecking::PointerCheck, 4> Kept;
+   for (const RuntimePointerChecking::PointerCheck &Candidate : AllChecks)
+     if (GuardsCrossPartitionPair(Candidate))
+       Kept.push_back(Candidate);
+
+   return Kept;
+ }
+
+ /// \brief Try to distribute an inner-most loop.
+ ///
+ /// The loop's memory operations are seeded into partitions, the partitions
+ /// are merged by the heuristics below, and, if at least two remain, the loop
+ /// is (optionally versioned and then) cloned once per partition.
+ ///
+ /// \returns true if \p L was distributed, false if one of the bail-out
+ /// conditions below was hit.
+ bool processLoop(Loop *L) {
+   assert(L->empty() && "Only process inner loops.");
+
+   DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
+                << "\" checking " << *L << "\n");
+
+   // Every "Skipping" message ends with a newline so that consecutive debug
+   // records do not run together on one line.
+   BasicBlock *PH = L->getLoopPreheader();
+   if (!PH) {
+     DEBUG(dbgs() << "Skipping; no preheader\n");
+     return false;
+   }
+   if (!L->getExitBlock()) {
+     DEBUG(dbgs() << "Skipping; multiple exit blocks\n");
+     return false;
+   }
+   // LAA will check that we only have a single exiting block.
+
+   const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+
+   // Currently, we only distribute to isolate the part of the loop with
+   // dependence cycles to enable partial vectorization.
+   if (LAI.canVectorizeMemory()) {
+     DEBUG(dbgs()
+           << "Skipping; memory operations are safe for vectorization\n");
+     return false;
+   }
+   auto *Dependences = LAI.getDepChecker().getDependences();
+   if (!Dependences || Dependences->empty()) {
+     DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate\n");
+     return false;
+   }
+
+   InstPartitionContainer Partitions(L, LI, DT);
+
+   // First, go through each memory operation and assign them to consecutive
+   // partitions (the order of partitions follows program order). Put those
+   // with unsafe dependences into "cyclic" partition otherwise put each store
+   // in its own "non-cyclic" partition (we'll merge these later).
+   //
+   // Note that a memory operation (e.g. Load2 below) at a program point that
+   // has an unsafe dependence (Store3->Load1) spanning over it must be
+   // included in the same cyclic partition as the dependent operations. This
+   // is to preserve the original program order after distribution. E.g.:
+   //
+   //                NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
+   //  Load1   -.                     1                       0->1
+   //  Load2    | /Unsafe/            0                       1
+   //  Store3  -'                    -1                       1->0
+   //  Load4                          0                       0
+   //
+   // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+   // we just keep assigning to the same cyclic partition until
+   // NumUnsafeDependencesActive reaches 0.
+   const MemoryDepChecker &DepChecker = LAI.getDepChecker();
+   MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+                                    *Dependences);
+
+   int NumUnsafeDependencesActive = 0;
+   for (auto &InstDep : MID) {
+     Instruction *I = InstDep.Inst;
+     // We update NumUnsafeDependencesActive post-instruction, catch the
+     // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+     if (NumUnsafeDependencesActive ||
+         InstDep.NumUnsafeDependencesStartOrEnd > 0)
+       Partitions.addToCyclicPartition(I);
+     else
+       Partitions.addToNewNonCyclicPartition(I);
+     NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+     assert(NumUnsafeDependencesActive >= 0 &&
+            "Negative number of dependences active");
+   }
+
+   // Add partitions for values used outside. These partitions can be out of
+   // order from the original program order. This is OK because if the
+   // partition uses a load we will merge this partition with the original
+   // partition of the load that we set up in the previous loop (see
+   // mergeToAvoidDuplicatedLoads).
+   auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+   for (auto *Inst : DefsUsedOutside)
+     Partitions.addToNewNonCyclicPartition(Inst);
+
+   DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+   if (Partitions.getSize() < 2)
+     return false;
+
+   // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+   // should be able to vectorize these together.
+   Partitions.mergeBeforePopulating();
+   DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+   if (Partitions.getSize() < 2)
+     return false;
+
+   // Now, populate the partitions with non-memory operations.
+   Partitions.populateUsedSet();
+   DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+   // In order to preserve original lexical order for loads, keep them in the
+   // partition that we set up in the MemoryInstructionDependences loop.
+   if (Partitions.mergeToAvoidDuplicatedLoads()) {
+     DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+                  << Partitions);
+     if (Partitions.getSize() < 2)
+       return false;
+   }
+
+   // Don't distribute the loop if we need too many SCEV run-time checks.
+   const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
+   if (Pred.getComplexity() > DistributeSCEVCheckThreshold) {
+     DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+     return false;
+   }
+
+   DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+   // We're done forming the partitions; set up the reverse mapping from
+   // instructions to partitions.
+   Partitions.setupPartitionIdOnInstructions();
+
+   // To keep things simple have an empty preheader before we version or clone
+   // the loop. (Also split if this has no predecessor, i.e. entry, because we
+   // rely on PH having a predecessor.)
+   if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+     SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+   // If we need run-time checks, version the loop now.
+   auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
+   const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
+   const auto &AllChecks = RtPtrChecking->getChecks();
+   auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
+                                                 RtPtrChecking);
+
+   if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+     DEBUG(dbgs() << "\nPointers:\n");
+     DEBUG(RtPtrChecking->printChecks(dbgs(), Checks));
+     LoopVersioning LVer(LAI, L, LI, DT, SE, false);
+     LVer.setAliasChecks(std::move(Checks));
+     LVer.setSCEVChecks(Pred);
+     LVer.versionLoop(DefsUsedOutside);
+   }
+
+   // Create identical copies of the original loop for each partition and hook
+   // them up sequentially.
+   Partitions.cloneLoops();
+
+   // Now, remove from each loop the instructions that don't belong to that
+   // partition.
+   Partitions.removeUnusedInsts();
+   DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+   DEBUG(Partitions.printBlocks());
+
+   if (LDistVerify) {
+     LI->verify();
+     DT->verifyDomTree();
+   }
+
+   ++NumLoopsDistributed;
+   return true;
+ }
+
+ // Analyses used.
+ LoopInfo *LI;
+ LoopAccessAnalysis *LAA;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+};
+} // anonymous namespace
+
+char LoopDistribute::ID;
+// Human-readable pass description handed to the INITIALIZE_PASS macros below.
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+
+namespace llvm {
+/// Create a new instance of the loop distribution pass.
+FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
new file mode 100644
index 0000000..4521640
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -0,0 +1,1112 @@
+//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form. In cases that this kicks in, it can be a significant
+// performance win.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+// memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+// fpowi
+// Future integer operation idioms to recognize:
+// ctpop, ctlz, cttz
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set. It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// We should enhance the memset/memcpy recognition to handle multiple stores in
+// the loop. This would handle things like:
+// void foo(_Complex float *P)
+// for (i) { __real__(*P) = 0; __imag__(*P) = 0; }
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-idiom"
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+
+namespace {
+
+class LoopIdiomRecognize : public LoopPass {
+ Loop *CurLoop; // Loop currently being processed; assigned at the top of runOnLoop.
+ AliasAnalysis *AA; // The analysis pointers below are (re)fetched in runOnLoop.
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ const DataLayout *DL; // Data layout of the module containing the loop header.
+
+public:
+ static char ID;
+ explicit LoopIdiomRecognize() : LoopPass(ID) {
+ initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>(); // Preserved only; not listed as required.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>(); // Preserved only.
+ AU.addPreserved<GlobalsAAWrapperPass>(); // Preserved only.
+ }
+
+private:
+ typedef SmallVector<StoreInst *, 8> StoreList;
+ StoreList StoreRefsForMemset; // Stores of the current block that collectStores() flagged for memset.
+ StoreList StoreRefsForMemcpy; // Stores of the current block that collectStores() flagged for memcpy.
+ bool HasMemset; // TLI->has(LibFunc::memset), cached in runOnLoop.
+ bool HasMemsetPattern; // TLI->has(LibFunc::memset_pattern16), cached in runOnLoop.
+ bool HasMemcpy; // TLI->has(LibFunc::memcpy), cached in runOnLoop.
+
+ /// \name Countable Loop Idiom Handling
+ /// @{
+
+ bool runOnCountableLoop();
+ bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ void collectStores(BasicBlock *BB);
+ bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy);
+ bool processLoopStore(StoreInst *SI, const SCEV *BECount);
+ bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+ bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+ unsigned StoreAlignment, Value *StoredVal,
+ Instruction *TheStore, const SCEVAddRecExpr *Ev,
+ const SCEV *BECount, bool NegStride);
+ bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+
+ /// @}
+ /// \name Noncountable Loop Idiom Handling
+ /// @{
+
+ bool runOnNoncountableLoop();
+
+ bool recognizePopcount();
+ void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var);
+
+ /// @}
+};
+
+} // End anonymous namespace.
+
+char LoopIdiomRecognize::ID = 0; // Static pass ID handed to the LoopPass base-class constructor.
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
+ false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } // Factory: returns a newly allocated pass object.
+
+/// deleteDeadInstruction - Erase \p I from its parent. Remaining uses of \p I
+/// are first rewritten to undef. Afterwards each former operand is handed to
+/// RecursivelyDeleteTriviallyDeadInstructions so that operands which became
+/// dead, and the computation feeding them, are removed as well.
+///
+static void deleteDeadInstruction(Instruction *I,
+                                  const TargetLibraryInfo *TLI) {
+  // Snapshot the operands up front; they cannot be read from I once it has
+  // been erased.
+  SmallVector<Value *, 16> FormerOperands(I->value_op_begin(),
+                                          I->value_op_end());
+
+  Value *Placeholder = UndefValue::get(I->getType());
+  I->replaceAllUsesWith(Placeholder);
+  I->eraseFromParent();
+
+  for (unsigned Idx = 0, NumOps = FormerOperands.size(); Idx != NumOps; ++Idx)
+    RecursivelyDeleteTriviallyDeadInstructions(FormerOperands[Idx], TLI);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+/// Pass entry point for a single loop. Caches the analyses and the library
+/// function availability flags, then dispatches to the countable or the
+/// non-countable idiom handling.
+///
+/// \returns the result of the handler that was run, or false when the loop is
+/// skipped.
+bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  CurLoop = L;
+
+  // A loop without a preheader could not be converted to canonical form (it
+  // must contain an indirectbr), so there is nothing we can do with it.
+  if (L->getLoopPreheader() == nullptr)
+    return false;
+
+  // Idiom recognition is disabled inside functions that are themselves named
+  // after one of the idioms.
+  const StringRef FnName = L->getHeader()->getParent()->getName();
+  const bool IsIdiomItself = FnName == "memset" || FnName == "memcpy";
+  if (IsIdiomItself)
+    return false;
+
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+      *CurLoop->getHeader()->getParent());
+  DL = &CurLoop->getHeader()->getModule()->getDataLayout();
+
+  HasMemset = TLI->has(LibFunc::memset);
+  HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
+  HasMemcpy = TLI->has(LibFunc::memcpy);
+
+  // The countable path is only worth taking when at least one of the library
+  // functions is available and the backedge-taken count is loop invariant.
+  const bool AnyLibFunc = HasMemset || HasMemsetPattern || HasMemcpy;
+  if (AnyLibFunc && SE->hasLoopInvariantBackedgeTakenCount(L))
+    return runOnCountableLoop();
+
+  return runOnNoncountableLoop();
+}
+
+/// Handle a loop whose backedge-taken count is computable: visit every block
+/// that belongs directly to CurLoop (not to a subloop) and try to turn its
+/// stores and memsets into library calls.
+///
+/// \returns true if any block was changed.
+bool LoopIdiomRecognize::runOnCountableLoop() {
+  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+  // Note the trailing space in the first literal: adjacent string literals
+  // are concatenated verbatim.
+  assert(!isa<SCEVCouldNotCompute>(BECount) &&
+         "runOnCountableLoop() called on a loop without a predictable "
+         "backedge-taken count");
+
+  // If this loop executes exactly one time, then it should be peeled, not
+  // optimized by this pass.
+  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+    if (BECst->getAPInt() == 0)
+      return false;
+
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+  DEBUG(dbgs() << "loop-idiom Scanning: F["
+               << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
+               << CurLoop->getHeader()->getName() << "\n");
+
+  bool MadeChange = false;
+  // Scan all the blocks in the loop that are not in subloops.
+  for (auto *BB : CurLoop->getBlocks()) {
+    // Ignore blocks in subloops.
+    if (LI->getLoopFor(BB) != CurLoop)
+      continue;
+
+    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
+  }
+  return MadeChange;
+}
+
+/// Return the size in bytes of the value written by \p SI.
+///
+/// The bit size must be a whole number of bytes and must fit in 32 bits;
+/// isLegalStore rejects every store that violates either condition before a
+/// store reaches this helper, and the assertion re-checks both.
+static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
+  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+  // Both conditions are required, so they are joined with &&. Joining them
+  // with || would accept any size that is not a multiple of 8 bits.
+  assert((SizeInBits & 7) == 0 && (SizeInBits >> 32) == 0 &&
+         "Don't overflow unsigned.");
+  return static_cast<unsigned>(SizeInBits >> 3);
+}
+
+/// Return the step of the address recurrence \p StoreEv as an unsigned value.
+/// The step operand must be a SCEVConstant. Callers detect a negative stride
+/// by comparing the store size against the unsigned negation of the result.
+static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) {
+  const SCEV *Step = StoreEv->getOperand(1);
+  return cast<SCEVConstant>(Step)->getAPInt().getZExtValue();
+}
+
+/// getMemSetPatternValue - Build the 16-byte pattern for a memset_pattern16
+/// call that replaces a strided store of \p V. Returns \p V itself when it is
+/// a 16-byte constant, a ConstantArray repeating it up to 16 bytes when it is
+/// smaller, and null when no pattern can be formed.
+///
+/// memset_pattern8 and memset_pattern4 are never used, because these just
+/// replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+  // Only constants can be placed in a constant array. Storing a non-constant
+  // to an alloca would be possible in theory but does not seem worthwhile.
+  Constant *C = dyn_cast<Constant>(V);
+  if (C == nullptr)
+    return nullptr;
+
+  // Only handle simple values that are a power of two bytes in size.
+  const uint64_t SizeInBits = DL->getTypeSizeInBits(V->getType());
+  const bool IsWholeBytes = (SizeInBits & 7) == 0;
+  const bool IsPowerOfTwo = (SizeInBits & (SizeInBits - 1)) == 0;
+  if (SizeInBits == 0 || !IsWholeBytes || !IsPowerOfTwo)
+    return nullptr;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (DL->isBigEndian())
+    return nullptr;
+
+  const uint64_t SizeInBytes = SizeInBits / 8;
+
+  // TODO: If the constant is larger than 16 bytes, we can try slicing it in
+  // half to see if the top and bottom are the same (e.g. for vectors and
+  // large integers).
+  if (SizeInBytes > 16)
+    return nullptr;
+
+  // A constant of exactly 16 bytes already is the pattern.
+  if (SizeInBytes == 16)
+    return C;
+
+  // Otherwise repeat the constant until 16 bytes are filled.
+  unsigned NumCopies = 16 / SizeInBytes;
+  ArrayType *PatternTy = ArrayType::get(V->getType(), NumCopies);
+  std::vector<Constant *> Elements(NumCopies, C);
+  return ConstantArray::get(PatternTy, Elements);
+}
+
+bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
+ bool &ForMemcpy) { // Returns true only after setting one of the two out-flags.
+ // Don't touch volatile stores.
+ if (!SI->isSimple())
+ return false;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
+ // Reject stores that are so large that they overflow an unsigned.
+ uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) // Also rejects sizes that are not whole bytes.
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *StoreEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return false;
+
+ // Check to see if we have a constant stride.
+ if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
+ return false;
+
+ // See if the store can be turned into a memset.
+
+ // If the stored value is a byte-wise value (like i32 -1), then it may be
+ // turned into a memset of i8 -1, assuming that all the consecutive bytes
+ // are stored. A store of i32 0x01020304 can never be turned into a memset,
+ // but it can be turned into memset_pattern if the target supports it.
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+ // If we're allowed to form a memset, and the stored value would be
+ // acceptable for memset, use it.
+ if (HasMemset && SplatValue &&
+ // Verify that the stored value is loop invariant. If not, we can't
+ // promote the memset.
+ CurLoop->isLoopInvariant(SplatValue)) {
+ // It looks like we can use SplatValue.
+ ForMemset = true; // Stride vs. store size is checked later, in processLoopStore.
+ return true;
+ } else if (HasMemsetPattern &&
+ // Don't create memset_pattern16s with address spaces.
+ StorePtr->getType()->getPointerAddressSpace() == 0 &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+ // It looks like we can use PatternValue!
+ ForMemset = true;
+ return true;
+ }
+
+ // Otherwise, see if the store can be turned into a memcpy.
+ if (HasMemcpy) {
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ unsigned Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ if (StoreSize != Stride && StoreSize != -Stride) // -Stride (unsigned negation) matches a negative stride.
+ return false;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); // Local LI shadows the LoopInfo member here.
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return false;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return false;
+
+ // Success. This store can be converted into a memcpy.
+ ForMemcpy = true;
+ return true;
+ }
+ // This store can't be transformed into a memset/memcpy.
+ return false;
+}
+
+/// Rebuild StoreRefsForMemset and StoreRefsForMemcpy for \p BB: every store
+/// accepted by isLegalStore is appended to the list matching the flag that
+/// isLegalStore set for it.
+void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
+  // Both lists describe a single block, so drop what an earlier call left.
+  StoreRefsForMemset.clear();
+  StoreRefsForMemcpy.clear();
+
+  for (Instruction &Inst : *BB) {
+    auto *Store = dyn_cast<StoreInst>(&Inst);
+    if (Store == nullptr)
+      continue;
+
+    // isLegalStore only accepts strided stores with a constant stride.
+    bool AsMemset = false;
+    bool AsMemcpy = false;
+    if (isLegalStore(Store, AsMemset, AsMemcpy)) {
+      // Remember the store under the idiom it qualified for.
+      if (AsMemset)
+        StoreRefsForMemset.push_back(Store);
+      else if (AsMemcpy)
+        StoreRefsForMemcpy.push_back(Store);
+    }
+  }
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(
+ BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!DT->dominates(BB, ExitBlocks[i]))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ collectStores(BB);
+
+ // Try to turn each collected memset candidate into a memset.
+ for (auto &SI : StoreRefsForMemset)
+ MadeChange |= processLoopStore(SI, BECount);
+
+ // Optimize the store into a memcpy, if it is fed by a similarly strided load.
+ for (auto &SI : StoreRefsForMemcpy)
+ MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++; // Advance first: Inst may be deleted below.
+ // Look for memset instructions, which may be optimized to a larger memset.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ WeakVH InstPtr(&*I); // Watches the next instruction so its deletion can be detected.
+ if (!processLoopMemSet(MSI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the memset invalidated our iterator, start over from the
+ // top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ continue;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// processLoopStore - Try to promote the strided store \p SI to a memset (or
+/// memset_pattern) covering the whole loop. \p SI must be a simple store
+/// whose pointer is an add recurrence with a constant step.
+bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
+  assert(SI->isSimple() && "Expected only non-volatile stores.");
+
+  Value *StoredVal = SI->getValueOperand();
+  Value *StorePtr = SI->getPointerOperand();
+  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+
+  // Every byte is touched by the loop only when the stride has the same
+  // magnitude as the store size. The stride is held as an unsigned, so a
+  // negative stride shows up as the unsigned negation of the store size.
+  unsigned Stride = getStoreStride(StoreEv);
+  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  const bool PosStride = StoreSize == Stride;
+  const bool NegStride = StoreSize == -Stride;
+  if (!PosStride && !NegStride)
+    return false;
+
+  // See if we can optimize just this store in isolation.
+  unsigned StoreAlignment = SI->getAlignment();
+  return processLoopStridedStore(StorePtr, StoreSize, StoreAlignment,
+                                 StoredVal, SI, StoreEv, BECount, NegStride);
+}
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+ const SCEV *BECount) {
+ // We can only handle non-volatile memsets with a constant size.
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ return false;
+
+ // If the target library does not provide memset, give up.
+ if (!TLI->has(LibFunc::memset))
+ return false;
+
+ Value *Pointer = MSI->getDest();
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+ if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+ return false;
+
+ // Reject memsets that are so large that they overflow an unsigned.
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if ((SizeInBytes >> 32) != 0)
+ return false;
+
+ // Check to see if the stride matches the size of the memset. If so, then we
+ // know that every byte is touched in the loop.
+ const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+
+ // TODO: Could also handle negative stride here someday, that will require the
+ // validity check in mayLoopAccessLocation to be updated though.
+ if (!Stride || MSI->getLength() != Stride->getValue()) // Compares the two constants by pointer identity.
+ return false;
+
+ // Verify that the memset value is loop invariant. If not, we can't promote
+ // the memset.
+ Value *SplatValue = MSI->getValue();
+ if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+ return false;
+
+ return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
+ MSI->getAlignment(), SplatValue, MSI, Ev,
+ BECount, /*NegStride=*/false);
+}
+
+/// mayLoopAccessLocation - Return true if any instruction of loop \p L other
+/// than \p IgnoredStore might access the loop-strided location starting at
+/// \p Ptr in one of the ways selected by \p Access (read and/or write).
+static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+                                  const SCEV *BECount, unsigned StoreSize,
+                                  AliasAnalysis &AA,
+                                  Instruction *IgnoredStore) {
+  // The access strides positively through memory, so by default the touched
+  // location is taken to start at the pointer and to have unknown size.
+  uint64_t AccessSize = MemoryLocation::UnknownSize;
+
+  // With a constant trip count the size can be refined to exactly the bytes
+  // the memset would cover, which is (BECount+1)*StoreSize.
+  const SCEVConstant *ConstBECount = dyn_cast<SCEVConstant>(BECount);
+  if (ConstBECount != nullptr) {
+    uint64_t TripCount = ConstBECount->getValue()->getZExtValue() + 1;
+    AccessSize = TripCount * StoreSize;
+  }
+
+  // TODO: For this to be really effective, we have to dive into the pointer
+  // operand in the store. Store to &A[i] of 100 will always return may alias
+  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
+  // which will then no-alias a store to &A[100].
+  MemoryLocation StoreLoc(Ptr, AccessSize);
+
+  for (BasicBlock *Block : L->getBlocks())
+    for (Instruction &Inst : *Block) {
+      if (&Inst == IgnoredStore)
+        continue;
+      if (AA.getModRefInfo(&Inst, StoreLoc) & Access)
+        return true;
+    }
+
+  return false;
+}
+
+// With a negative stride, Start denotes the end of the memory region we are
+// trying to memset. The base pointer therefore has to be recomputed as
+// Start - BECount*StoreSize.
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
+                                        Type *IntPtr, unsigned StoreSize,
+                                        ScalarEvolution *SE) {
+  const SCEV *ByteOffset = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+  // Scaling by one would be a no-op, so only multiply for wider stores.
+  if (StoreSize != 1) {
+    const SCEV *Scale = SE->getConstant(IntPtr, StoreSize);
+    ByteOffset = SE->getMulExpr(ByteOffset, Scale, SCEV::FlagNUW);
+  }
+  return SE->getMinusSCEV(Start, ByteOffset);
+}
+
+/// processLoopStridedStore - We see a strided store of some value. If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::processLoopStridedStore(
+ Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
+ Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev,
+ const SCEV *BECount, bool NegStride) {
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+ if (!SplatValue) // No byte splat available: fall back to a memset_pattern16 value.
+ PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+ assert((SplatValue || PatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ // The trip count of the loop and the base pointer of the addrec SCEV is
+ // guaranteed to be loop invariant, which means that it should dominate the
+ // header. This allows us to insert code for it in the preheader.
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+ Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+ Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
+
+ const SCEV *Start = Ev->getStart();
+ // Handle negative strided loops.
+ if (NegStride)
+ Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
+
+ // Okay, we have a strided store "p[i]" of a splattable value. We can turn
+ // this into a memset in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write to the aliased location. Check for any overlap by generating the
+ // base pointer and checking the region.
+ Value *BasePtr =
+ Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+ if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
+ *AA, TheStore)) {
+ Expander.clear();
+ // If we generated new code for the base pointer, clean up.
+ RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
+ return false;
+ }
+
+ // Okay, everything looks good, insert the memset.
+
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+ const SCEV *NumBytesS =
+ SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
+ if (StoreSize != 1) {
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ }
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+ CallInst *NewCall;
+ if (SplatValue) {
+ NewCall =
+ Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
+ } else {
+ // Everything is emitted in default address space
+ Type *Int8PtrTy = DestInt8PtrTy;
+
+ Module *M = TheStore->getModule();
+ Value *MSP =
+ M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
+ Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+
+ // Otherwise we should form a memset_pattern16. PatternValue is known to be
+ // a constant array of 16 bytes. Plop the value into a mergeable global.
+ GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+ GlobalValue::PrivateLinkage,
+ PatternValue, ".memset_pattern");
+ GV->setUnnamedAddr(true); // Ok to merge these.
+ GV->setAlignment(16);
+ Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+ NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+ }
+
+ DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+ NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+ // Okay, the memset has been formed. Zap the original store and anything that
+ // feeds into it.
+ deleteDeadInstruction(TheStore, TLI);
+ ++NumMemSet; // Counted as a formed memset for both the memset and the memset_pattern16 case.
+ return true;
+}
+
+/// Try to replace a strided store of a strided load with a single memcpy.
+///
+/// If the stored value is a strided load in the same loop with the same stride
+/// this may be transformable into a memcpy. This kicks in for stuff like
+///   for (i) A[i] = B[i];
+///
+/// The base pointers and the byte count are expanded in the loop preheader.
+/// The transformation is abandoned (and any code expanded for the base
+/// pointers is cleaned up) if something else in the loop may read or write
+/// the stored region, or may write the loaded region.
+///
+/// \param SI      Simple store whose value operand is a simple load; the
+///                pointer operands of both must be add-recurrences (cast<>
+///                asserts otherwise).
+/// \param BECount Count used to size the copy: (BECount + 1) * StoreSize
+///                bytes are copied.
+/// \returns true if the memcpy was formed and the original store deleted.
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+                                                    const SCEV *BECount) {
+  assert(SI->isSimple() && "Expected only non-volatile stores.");
+
+  Value *StorePtr = SI->getPointerOperand();
+  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  unsigned Stride = getStoreStride(StoreEv);
+  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  // Unsigned negation: true when the stride equals minus the store size.
+  bool NegStride = StoreSize == -Stride;
+
+  // The store must be feeding a non-volatile load.
+  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+  assert(LI->isSimple() && "Expected only non-volatile loads.");
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided load. If we have something else, it's a
+  // random load we can't handle.
+  const SCEVAddRecExpr *LoadEv =
+      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header. This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+  const SCEV *StrStart = StoreEv->getStart();
+  unsigned StrAS = SI->getPointerAddressSpace();
+  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
+
+  // Handle negative strided loops.
+  if (NegStride)
+    StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
+
+  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+  // this into a memcpy in the loop preheader now if we want. However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write the memory region we're storing to. This includes the load that
+  // feeds the stores. Check for an alias by generating the base address and
+  // checking everything.
+  Value *StoreBasePtr = Expander.expandCodeFor(
+      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+                            StoreSize, *AA, SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+    return false;
+  }
+
+  const SCEV *LdStart = LoadEv->getStart();
+  unsigned LdAS = LI->getPointerAddressSpace();
+
+  // Handle negative strided loops.
+  if (NegStride)
+    LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
+
+  // For a memcpy, we have to make sure that the input array is not being
+  // mutated by the loop.
+  Value *LoadBasePtr = Expander.expandCodeFor(
+      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
+                            *AA, SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointers, clean up.
+    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+    return false;
+  }
+
+  // Okay, everything is safe, we can transform this!
+
+  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+  // pointer size if it isn't already.
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+
+  const SCEV *NumBytesS =
+      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
+                               SCEV::FlagNUW);
+
+  Value *NumBytes =
+      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+
+  // The copy may only assume the weaker of the two access alignments.
+  CallInst *NewCall =
+      Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
+                           std::min(SI->getAlignment(), LI->getAlignment()));
+  NewCall->setDebugLoc(SI->getDebugLoc());
+
+  DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+               << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+               << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+  // Okay, the memcpy has been formed. Zap the original store and anything that
+  // feeds into it.
+  deleteDeadInstruction(SI, TLI);
+  ++NumMemCpy;
+  return true;
+}
+
+/// Runs idiom recognition for the noncountable-loop case. The only idiom tried
+/// here is popcount; the result of that attempt is returned unchanged.
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+  const bool FoundIdiom = recognizePopcount();
+  return FoundIdiom;
+}
+
+/// Match a conditional branch of the form "if (V != 0) goto LoopEntry" (or the
+/// equivalent "if (V == 0) goto elsewhere; else goto LoopEntry").
+///
+/// The branch condition must be an integer compare whose second operand is the
+/// constant zero. On a match the compared variable (the first operand of the
+/// compare) is returned; otherwise nullptr. This is used to check that the
+/// precondition and postcondition of the loop are in the desired form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
+  if (!BI || BI->isUnconditional())
+    return nullptr;
+
+  auto *Compare = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Compare)
+    return nullptr;
+
+  auto *RHS = dyn_cast<ConstantInt>(Compare->getOperand(1));
+  if (!RHS || !RHS->isZero())
+    return nullptr;
+
+  // Index of the successor taken when the compared value is non-zero.
+  unsigned NonZeroSucc;
+  switch (Compare->getPredicate()) {
+  case ICmpInst::ICMP_NE:
+    NonZeroSucc = 0;
+    break;
+  case ICmpInst::ICMP_EQ:
+    NonZeroSucc = 1;
+    break;
+  default:
+    return nullptr;
+  }
+
+  if (BI->getSuccessor(NonZeroSucc) != LoopEntry)
+    return nullptr;
+  return Compare->getOperand(0);
+}
+
+/// Return true iff the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the population bit.
+/// 2) \p CntPhi is set to the corresponding phi node.
+/// 3) \p Var is set to the value whose population bits are being counted.
+///
+/// The output parameters are only written when the function returns true.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 == 0)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val;
+///    do {
+///       x1 = phi (x0, x2);
+///       cnt1 = phi(cnt0, cnt2);
+///
+///       cnt2 = cnt1 + 1;
+///        ...
+///       x2 = x1 & (x1 - 1);
+///        ...
+///    } while(x2 != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
+                                Instruction *&CntInst, PHINode *&CntPhi,
+                                Value *&Var) {
+  BasicBlock *LoopEntry = *(CurLoop->block_begin());
+  Instruction *DefX2 = nullptr, *CountInst = nullptr;
+  Value *VarX1 = nullptr;
+  PHINode *PhiX = nullptr, *CountPhi = nullptr;
+
+  // step 1: Check if the loop-back branch is in desirable form:
+  //   "if (a != 0) goto loop-entry".
+  {
+    if (Value *T = matchCondition(
+            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+  {
+    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    // The decrement must be applied to x1 itself; otherwise something like
+    // "x2 = x1 & (y - 1)" would be mistaken for the popcount recurrence.
+    if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    // Accept both "x1 - 1" and "x1 + (-1)".
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add &&
+           Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    // The phi has to live in the loop block itself; a phi elsewhere is not the
+    // loop-carried recurrence (and need not have two operands to inspect).
+    if (!PhiX || PhiX->getParent() != LoopEntry ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
+  {
+    CountInst = nullptr;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                              IterE = LoopEntry->end();
+         Iter != IterE; Iter++) {
+      Instruction *Inst = &*Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live out of the loop.
+      bool LiveOutLoop = false;
+      for (User *U : Inst->users()) {
+        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+          LiveOutLoop = true;
+          break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+    // The value tested by the precondition must be the initial value of x.
+    if (!T || (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// The loop must be a single block with a single backedge and fewer than 20
+/// instructions, entered through a preheader that holds nothing but an
+/// unconditional branch and whose only predecessor ends in a conditional
+/// branch. If the idiom is detected, the code is transformed to issue the
+/// popcount intrinsic and true is returned; otherwise false is returned.
+bool LoopIdiomRecognize::recognizePopcount() {
+  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+    return false;
+
+  // Counting population are usually conducted by few arithmetic instructions.
+  // Such instructions can be easily "absorbed" by vacant slots in a
+  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
+  // in a compact loop.
+
+  // Only single-backedge, single-block loops are considered.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  // Bail out when the loop body is too big.
+  BasicBlock *OnlyBlock = *(CurLoop->block_begin());
+  if (OnlyBlock->size() >= 20)
+    return false;
+
+  // The preheader must exist and contain nothing but its terminator ...
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  if (!Preheader || &Preheader->front() != Preheader->getTerminator())
+    return false;
+  // ... which must be an unconditional branch.
+  auto *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+  if (!PreheaderBr || !PreheaderBr->isUnconditional())
+    return false;
+
+  // There must be a precondition block, ending in a conditional branch, where
+  // the generated popcount intrinsic call can be inserted.
+  BasicBlock *GuardBB = Preheader->getSinglePredecessor();
+  if (!GuardBB)
+    return false;
+  auto *GuardBr = dyn_cast<BranchInst>(GuardBB->getTerminator());
+  if (!GuardBr || !GuardBr->isConditional())
+    return false;
+
+  Instruction *Counter;
+  PHINode *CounterPhi;
+  Value *Input;
+  const bool Matched =
+      detectPopcountIdiom(CurLoop, GuardBB, Counter, CounterPhi, Input);
+  if (Matched)
+    transformLoopToPopcount(GuardBB, Counter, CounterPhi, Input);
+  return Matched;
+}
+
+/// Emit a call to the ctpop intrinsic, overloaded on the type of \p Val, at
+/// the builder's current insertion point, and tag the call with \p DL.
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+                                       DebugLoc DL) {
+  // Function -> Module of the block we are inserting into.
+  Module *ParentModule = IRBuilder.GetInsertBlock()->getParent()->getParent();
+
+  Type *OverloadTys[] = {Val->getType()};
+  Value *CtpopDecl =
+      Intrinsic::getDeclaration(ParentModule, Intrinsic::ctpop, OverloadTys);
+
+  Value *Args[] = {Val};
+  CallInst *Call = IRBuilder.CreateCall(CtpopDecl, Args);
+  Call->setDebugLoc(DL);
+  return Call;
+}
+
+/// Rewrite a loop matched by detectPopcountIdiom so that the count comes from
+/// a ctpop call.
+///
+/// A ctpop of \p Var is inserted before the terminator of \p PreCondBB, the
+/// precondition is rewritten to test the ctpop result, the loop's exit test is
+/// rewritten to count down a new "tcphi" trip counter, and every use of
+/// \p CntInst outside the loop block is replaced with the new count (plus the
+/// initial value of \p CntPhi when that is not the constant zero).
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+ Instruction *CntInst,
+ PHINode *CntPhi, Value *Var) {
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc DL = CntInst->getDebugLoc();
+
+ // Assuming before transformation, the loop is following:
+ // if (x) // the precondition
+ // do { cnt++; x &= x - 1; } while(x);
+
+ // Step 1: Insert the ctpop instruction at the end of the precondition block
+ IRBuilder<> Builder(PreCondBr);
+ Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+ {
+ PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+ // Bring the ctpop result to the width of the counter phi.
+ NewCount = PopCntZext =
+ Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+ // Only a freshly created cast needs a debug location; when no cast was
+ // needed NewCount is the ctpop call itself, which already has one.
+ if (NewCount != PopCnt)
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+ // TripCnt is exactly the number of iterations the loop has
+ TripCnt = NewCount;
+
+ // If the population counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero()) {
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+ }
+ }
+
+ // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
+ // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
+ // function would be partial dead code, and downstream passes will drag
+ // it back from the precondition block to the preheader.
+ {
+ ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+ Value *Opnd0 = PopCntZext;
+ Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+ // Keep the operand order of the original compare.
+ if (PreCond->getOperand(0) != Var)
+ std::swap(Opnd0, Opnd1);
+
+ ICmpInst *NewPreCond = cast<ICmpInst>(
+ Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+ PreCondBr->setCondition(NewPreCond);
+
+ // The old compare (and anything feeding only it) may now be dead.
+ RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+ }
+
+ // Step 3: Note that the population count is exactly the trip count of the
+ // loop in question, which enables us to convert the loop from a noncountable
+ // loop into a countable one. The benefit is twofold:
+ //
+ // - If the loop only counts population, the entire loop becomes dead after
+ // the transformation. It is a lot easier to prove a countable loop dead
+ // than to prove a noncountable one. (In some C dialects, an infinite loop
+ // isn't dead even if it computes nothing useful. In general, DCE needs
+ // to prove a noncountable loop finite before safely deleting it.)
+ //
+ // - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively
+ // optimized by some optimizations which are in general not applicable
+ // to a noncountable loop.
+ //
+ // After this step, this loop (conceptually) would look like following:
+ // newcnt = __builtin_ctpop(x);
+ // t = newcnt;
+ // if (x)
+ // do { cnt++; x &= x-1; t--; } while (t > 0);
+ BasicBlock *Body = *(CurLoop->block_begin());
+ {
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = TripCnt->getType();
+
+ // New trip-count phi, placed at the very top of the loop block.
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ // The decrement is emitted right before the loop-back compare.
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(TripCnt, PreHead);
+ TcPhi->addIncoming(TcDec, Body);
+
+ // Reuse the existing compare: continue while the decremented counter is
+ // still positive, whichever successor the back edge happens to be.
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+ }
+
+ // Step 4: All the references to the original population counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctpop().
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
new file mode 100644
index 0000000..b4102fe
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -0,0 +1,195 @@
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+namespace {
+ // Loop pass that simplifies instructions inside a loop; the work is done in
+ // runOnLoop, which is defined out of line.
+ class LoopInstSimplify : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopInstSimplify() : LoopPass(ID) {
+ initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop*, LPPassManager&) override;
+
+ // Declares the pass's analysis contract: the CFG is left intact; the
+ // assumption cache, loop info, loop-simplify form and target library info
+ // are required; loop-simplify form, LCSSA and scalar evolution are
+ // preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+ };
+}
+
+// Definition of the pass ID declared in the class; the value itself is not
+// meaningful (see the "replacement for typeid" note on the declaration).
+char LoopInstSimplify::ID = 0;
+// Register the pass under the command-line name "loop-instsimplify" together
+// with the passes it lists as initialization dependencies.
+INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+
+/// Factory for the loop instruction simplification pass: returns a newly
+/// allocated LoopInstSimplify instance.
+Pass *llvm::createLoopInstSimplifyPass() {
+  Pass *NewPass = new LoopInstSimplify();
+  return NewPass;
+}
+
+/// Simplify the instructions of loop \p L until nothing changes any more.
+///
+/// Each round walks the loop's blocks from the header using an explicit stack.
+/// For the header of a contained sub-loop only the PHI nodes and the first
+/// non-PHI instruction are visited; the rest of the sub-loop body is skipped
+/// and the walk continues at those sub-loop exit blocks that belong directly
+/// to \p L. Exit blocks of \p L itself are never visited. On the first round
+/// every instruction is tried; later rounds only retry the users of values
+/// that were replaced in the previous round.
+///
+/// \returns true if any instruction was replaced or deleted.
+bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
+  // The data layout does not depend on the block being visited.
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+  // Sorted so that membership can be tested with a binary search below.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
+
+  SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+  // The bit we are stealing from the pointer represents whether this basic
+  // block is the header of a subloop, in which case we only process its phis.
+  typedef PointerIntPair<BasicBlock*, 1> WorklistItem;
+  SmallVector<WorklistItem, 16> VisitStack;
+  SmallPtrSet<BasicBlock*, 32> Visited;
+
+  bool Changed = false;
+  bool LocalChanged;
+  do {
+    LocalChanged = false;
+
+    VisitStack.clear();
+    Visited.clear();
+
+    VisitStack.push_back(WorklistItem(L->getHeader(), false));
+
+    while (!VisitStack.empty()) {
+      WorklistItem Item = VisitStack.pop_back_val();
+      BasicBlock *BB = Item.getPointer();
+      bool IsSubloopHeader = Item.getInt();
+
+      // Simplify instructions in the current basic block.
+      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+        Instruction *I = &*BI++;
+
+        // The first time through the loop ToSimplify is empty and we try to
+        // simplify all instructions. On later iterations ToSimplify is not
+        // empty and we only bother simplifying instructions that are in it.
+        if (!ToSimplify->empty() && !ToSimplify->count(I))
+          continue;
+
+        // Remember this now: I may be deleted below, after which it must not
+        // be inspected any more.
+        const bool IsPHI = isa<PHINode>(I);
+
+        // Don't bother simplifying unused instructions.
+        if (!I->use_empty()) {
+          Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC);
+          if (V && LI->replacementPreservesLCSSAForm(I, V)) {
+            // Mark all uses for resimplification next time round the loop.
+            for (User *U : I->users())
+              Next->insert(cast<Instruction>(U));
+
+            I->replaceAllUsesWith(V);
+            LocalChanged = true;
+            ++NumSimplified;
+          }
+        }
+        bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+        if (res) {
+          // RecursivelyDeleteTriviallyDeadInstruction can remove
+          // more than one instruction, so simply incrementing the
+          // iterator does not work. When instructions get deleted
+          // re-iterate instead.
+          BI = BB->begin(); BE = BB->end();
+          LocalChanged |= res;
+        }
+
+        // In a sub-loop header stop after the first non-PHI instruction.
+        if (IsSubloopHeader && !IsPHI)
+          break;
+      }
+
+      // Add all successors to the worklist, except for loop exit blocks and the
+      // bodies of subloops. We visit the headers of loops so that we can process
+      // their phis, but we contract the rest of the subloop body and only follow
+      // edges leading back to the original loop.
+      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
+           ++SI) {
+        BasicBlock *SuccBB = *SI;
+        if (!Visited.insert(SuccBB).second)
+          continue;
+
+        const Loop *SuccLoop = LI->getLoopFor(SuccBB);
+        if (SuccLoop && SuccLoop->getHeader() == SuccBB
+                     && L->contains(SuccLoop)) {
+          VisitStack.push_back(WorklistItem(SuccBB, true));
+
+          SmallVector<BasicBlock*, 8> SubLoopExitBlocks;
+          SuccLoop->getExitBlocks(SubLoopExitBlocks);
+
+          for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
+            BasicBlock *ExitBB = SubLoopExitBlocks[i];
+            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
+              VisitStack.push_back(WorklistItem(ExitBB, false));
+          }
+
+          continue;
+        }
+
+        bool IsExitBlock = std::binary_search(ExitBlocks.begin(),
+                                              ExitBlocks.end(), SuccBB);
+        if (IsExitBlock)
+          continue;
+
+        VisitStack.push_back(WorklistItem(SuccBB, false));
+      }
+    }
+
+    // Place the list of instructions to simplify on the next loop iteration
+    // into ToSimplify.
+    std::swap(ToSimplify, Next);
+    Next->clear();
+
+    Changed |= LocalChanged;
+  } while (LocalChanged);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
new file mode 100644
index 0000000..4295235
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -0,0 +1,1307 @@
+//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the loop interchange transform.
+// It interchanges loops to provide more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+namespace {
+
+typedef SmallVector<Loop *, 8> LoopVector;
+
+// TODO: Check if we can use a sparse matrix here.
+typedef std::vector<std::vector<char>> CharMatrix;
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+struct LoopInterchange;
+
+#ifdef DUMP_DEP_MATRICIES
+// Debug helper: prints the dependency matrix one row per line, with a space
+// after every entry.
+void printDepMatrix(CharMatrix &DepMatrix) {
+  for (const std::vector<char> &Row : DepMatrix) {
+    for (char Entry : Row)
+      DEBUG(dbgs() << Entry << " ");
+    DEBUG(dbgs() << "\n");
+  }
+}
+#endif
+
+// Fills DepMatrix with one row of direction characters per anti dependence
+// found between the simple loads and stores of loop L.
+//
+// Each row holds one character per dependence level: '<', '=', '>', 'S' for a
+// scalar level, '*' when the direction is none of LT/LE/GT/GE/EQ, and is then
+// padded with 'I' until it has Level entries.
+//
+// Returns false when the matrix cannot be used: Level exceeds
+// MaxLoopNestDepth, a non-simple load or store is present, a flow dependence
+// is found, more than MaxMemInstrCount rows are produced, or no row was
+// produced at all. Dependences that are neither flow nor anti add no row.
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+ Loop *L, DependenceAnalysis *DA) {
+ typedef SmallVector<Value *, 16> ValueVector;
+ ValueVector MemInstr;
+
+ if (Level > MaxLoopNestDepth) {
+ DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+
+ // For each block.
+ for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+ BB != BE; ++BB) {
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
+ ++I) {
+ Instruction *Ins = dyn_cast<Instruction>(I);
+ if (!Ins)
+ return false;
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ // Only loads and stores take part in the analysis.
+ if (!St && !Ld)
+ continue;
+ // Non-simple accesses make the whole loop unanalyzable here.
+ if (Ld && !Ld->isSimple())
+ return false;
+ if (St && !St->isSimple())
+ return false;
+ MemInstr.push_back(&*I);
+ }
+ }
+
+ DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ // Examine every pair (I, J) with J at or after I in collection order.
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = dyn_cast<Instruction>(*I);
+ Instruction *Des = dyn_cast<Instruction>(*J);
+ if (Src == Des)
+ continue;
+ // Two loads are never checked against each other.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
+ continue;
+ if (auto D = DA->depends(Src, Des, true)) {
+ DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
+ << "\n");
+ if (D->isFlow()) {
+ // TODO: Handle flow dependence. Check if it is sufficient to populate
+ // the Dependence Matrix with the direction reversed.
+ DEBUG(dbgs() << "Flow dependence not handled");
+ return false;
+ }
+ if (D->isAnti()) {
+ DEBUG(dbgs() << "Found Anti dependence \n");
+ unsigned Levels = D->getLevels();
+ char Direction;
+ // Levels are numbered from 1.
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ // A constant distance fixes the direction by its sign.
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ // No constant distance: fall back to the direction set, folding
+ // LE into '<' and GE into '>'.
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ // Pad the row with 'I' until it has exactly Level entries.
+ // NOTE(review): this never terminates if the row is already longer
+ // than Level -- presumably callers guarantee Levels <= Level; confirm.
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ // We don't have a DepMatrix to check legality return false.
+ if (DepMatrix.size() == 0)
+ return false;
+ return true;
+}
+
+// A loop is moved from index 'from' to an index 'to'. Reflect that in the
+// dependence matrix by exchanging the two corresponding columns in every row.
+static void interChangeDepedencies(CharMatrix &DepMatrix, unsigned FromIndx,
+                                   unsigned ToIndx) {
+  for (std::vector<char> &Row : DepMatrix) {
+    const char Saved = Row[ToIndx];
+    Row[ToIndx] = Row[FromIndx];
+    Row[FromIndx] = Saved;
+  }
+}
+
+// Scans columns 0..Column (inclusive) of the given row and reports whether the
+// first entry that is '<' or '>' is '>'. Returns false when neither occurs,
+// i.e. when every entry in that range is something else such as '=', 'S' or
+// 'I'.
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+                                   unsigned Column) {
+  for (unsigned Col = 0; Col <= Column; ++Col) {
+    switch (DepMatrix[Row][Col]) {
+    case '<':
+      return false;
+    case '>':
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// Checks if no dependence exist in the dependency matrix in Row before Column,
+// i.e. every entry in columns [0, Column) is '=', 'S' or 'I'. Trivially true
+// when Column is 0.
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+                                 unsigned Column) {
+  for (unsigned i = 0; i < Column; ++i) {
+    const char Dir = DepMatrix[Row][i];
+    // An entry carries a dependence only if it is none of the three neutral
+    // markers, so the tests must be and-ed: or-ing them is always true.
+    if (Dir != '=' && Dir != 'S' && Dir != 'I')
+      return false;
+  }
+  return true;
+}
+
+// Decides for a single row of the dependency matrix whether swapping the inner
+// and outer loop keeps that row legal. InnerDep and OuterDep are the row's
+// entries for the two loops being interchanged.
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+                                unsigned OuterLoopId, char InnerDep,
+                                char OuterDep) {
+
+  if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+    return false;
+
+  // Swapping two equal entries leaves the row unchanged.
+  if (InnerDep == OuterDep)
+    return true;
+
+  // It is legal to interchange if and only if after interchange no row has a
+  // '>' direction as the leftmost non-'='.
+  switch (InnerDep) {
+  case '=':
+  case 'S':
+  case 'I':
+  case '<':
+    return true;
+  case '>':
+    // If OuterLoopId represents outermost loop then interchanging will make
+    // the 1st dependency as '>'. Otherwise, if all dependencies before
+    // OuterLoopId are '=', 'S' or 'I', interchanging will result in this row
+    // having an outermost non '=' dependency of '>'.
+    return OuterLoopId != 0 &&
+           !containsNoDependence(DepMatrix, Row, OuterLoopId);
+  default:
+    return false;
+  }
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
+// A row with an unknown ('*') direction for either loop makes the interchange
+// illegal outright.
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+                                      unsigned InnerLoopId,
+                                      unsigned OuterLoopId) {
+
+  // Every row has to allow the interchange.
+  for (unsigned Row = 0, NumRows = DepMatrix.size(); Row != NumRows; ++Row) {
+    const char InnerDep = DepMatrix[Row][InnerLoopId];
+    const char OuterDep = DepMatrix[Row][OuterLoopId];
+    const bool HasUnknownDir = InnerDep == '*' || OuterDep == '*';
+    if (HasUnknownDir ||
+        !validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
+      return false;
+  }
+  return true;
+}
+
+// Collects the chain of loops starting at L, outermost first, by descending
+// through sub-loops for as long as each loop has exactly one, and appends the
+// chain to V. Nothing is appended if any loop on the way has more than one
+// sub-loop.
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+
+  DEBUG(dbgs() << "Calling populateWorklist called\n");
+  LoopVector Nest;
+  for (Loop *Cur = &L;;) {
+    const std::vector<Loop *> &Children = Cur->getSubLoops();
+    // The current loop has multiple subloops in it hence it is not tightly
+    // nested. Discard everything collected so far.
+    if (Children.size() > 1)
+      return;
+    Nest.push_back(Cur);
+    if (Children.empty())
+      break;
+    Cur = Children.front();
+  }
+  V.push_back(std::move(Nest));
+}
+
+// Returns the induction variable of L: the canonical induction variable if
+// there is one, otherwise the first header PHI whose SCEV is an affine
+// add-recurrence with a constant step. Returns nullptr if the loop has no
+// latch or no predecessor, if a header PHI of a type other than integer,
+// floating point or pointer is encountered first, or if no PHI qualifies.
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+  if (PHINode *Canonical = L->getCanonicalInductionVariable())
+    return Canonical;
+  if (!L->getLoopLatch() || !L->getLoopPredecessor())
+    return nullptr;
+
+  for (BasicBlock::iterator It = L->getHeader()->begin(); isa<PHINode>(It);
+       ++It) {
+    PHINode *Candidate = cast<PHINode>(It);
+    Type *Ty = Candidate->getType();
+    const bool SupportedTy =
+        Ty->isIntegerTy() || Ty->isFloatingPointTy() || Ty->isPointerTy();
+    if (!SupportedTy)
+      return nullptr;
+
+    const auto *Rec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Candidate));
+    // Found the induction variable.
+    // FIXME: Handle loops with more than one induction variable. Note that,
+    // currently, legality makes sure we have only one induction variable.
+    if (Rec && Rec->isAffine() &&
+        isa<SCEVConstant>(Rec->getStepRecurrence(*SE)))
+      return Candidate;
+  }
+  return nullptr;
+}
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+///
+/// An instance is bound to one (outer, inner) loop pair and to the analyses
+/// passed to the constructor; it does not own any of them.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+
+ /// Check if the loops can be interchanged.
+ /// \p InnerLoopId and \p OuterLoopId are the column indices of the two
+ /// loops in \p DepMatrix.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ /// Returns true when a limitation of the transform prevents interchanging
+ /// this loop pair.
+ bool currentLimitations();
+
+ /// True once currentLimitations() has seen reduction PHIs in the inner loop.
+ bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+ bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
+ bool areAllUsesReductions(Instruction *Ins, Loop *L);
+ bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
+ /// Sorts the header PHIs of \p L into \p Inductions and \p Reductions;
+ /// returns false if some PHI is neither.
+ bool findInductionAndReductions(Loop *L,
+ SmallVector<PHINode *, 8> &Inductions,
+ SmallVector<PHINode *, 8> &Reductions);
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ /// Forwarded to InsertPreheaderForLoop when a preheader has to be created.
+ bool PreserveLCSSA;
+
+ /// Set by currentLimitations(); starts out false.
+ bool InnerLoopHasReduction;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+///
+/// Bound to one (outer, inner) loop pair; the loops and the ScalarEvolution
+/// instance are borrowed, not owned.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+
+ /// Check if the loop interchange is profitable.
+ /// \p InnerLoopId and \p OuterLoopId are the column indices of the two
+ /// loops in \p DepMatrix.
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ /// Cost derived from the order in which inner/outer induction variables
+ /// appear in the GEP operands of the inner loop; negative means the current
+ /// order has more "bad" than "good" GEPs.
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+};
+
+/// LoopInterchangeTransform interchanges the loop.
+///
+/// Bound to one (outer, inner) loop pair. All pointers handed to the
+/// constructor are borrowed, not owned.
+class LoopInterchangeTransform {
+public:
+ /// \p LoopNestExit is the block the outer loop header is redirected to in
+ /// place of the outer latch (see adjustLoopBranches).
+ /// \p InnerLoopContainsReductions selects how the inner loop header is split.
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ BasicBlock *LoopNestExit,
+ bool InnerLoopContainsReductions)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit),
+ InnerLoopHasReduction(InnerLoopContainsReductions) {}
+
+ /// Interchange OuterLoop and InnerLoop.
+ /// Returns false if the interchange could not be carried out.
+ bool transform();
+ /// Update the LoopInfo loop tree so that InnerLoop becomes the parent of
+ /// OuterLoop.
+ void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ /// Detach \p InnerLoop from the sub-loop list of \p OuterLoop.
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+ void splitInnerLoopLatch(Instruction *);
+ void splitOuterLoopLatch();
+ void splitInnerLoopHeader();
+ /// Runs adjustLoopBranches() and, on success, adjustLoopPreheaders().
+ bool adjustLoopLinks();
+ void adjustLoopPreheaders();
+ void adjustOuterLoopPreheader();
+ void adjustInnerLoopPreheader();
+ bool adjustLoopBranches();
+ /// In every PHI of \p CurrBlock, rename incoming block \p OldPred to
+ /// \p NewPred.
+ void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
+ BasicBlock *NewPred);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ /// Exit block of the whole loop nest, as supplied to the constructor.
+ BasicBlock *LoopExit;
+ bool InnerLoopHasReduction;
+};
+
+// Main LoopInterchange Pass.
+//
+// For every top-level loop of the function, populateWorklist() records the
+// chain of singly nested loops below it. Each recorded nest is handed to
+// processLoopList(), which tries to move the selected (currently innermost)
+// loop outwards one level at a time.
+struct LoopInterchange : public FunctionPass {
+  static char ID;
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  DependenceAnalysis *DA;
+  DominatorTree *DT;
+  bool PreserveLCSSA;
+  LoopInterchange()
+      : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr),
+        PreserveLCSSA(false) {
+    initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DependenceAnalysis>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+  }
+
+  /// Runs the interchange over every loop nest of \p F.
+  /// \returns true if at least one pair of loops was interchanged.
+  bool runOnFunction(Function &F) override {
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DA = &getAnalysis<DependenceAnalysis>();
+    // The dominator tree is declared as required in getAnalysisUsage() and is
+    // dereferenced unconditionally in processLoopList(), so fetch it as a
+    // required analysis rather than as an optional (nullable) one.
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+    // Build up a worklist of loop pairs to analyze.
+    SmallVector<LoopVector, 8> Worklist;
+
+    for (Loop *L : *LI)
+      populateWorklist(*L, Worklist);
+
+    DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+    // Accumulate the result over all nests: a later nest that is left alone
+    // must not hide an earlier one that was changed, and an empty worklist
+    // must report "no change".
+    bool Changed = false;
+    while (!Worklist.empty()) {
+      LoopVector LoopList = Worklist.pop_back_val();
+      Changed |= processLoopList(LoopList, F);
+    }
+    return Changed;
+  }
+
+  /// \returns true if every loop in \p LoopList has a computable backedge
+  /// taken count, exactly one backedge and a single exiting block.
+  bool isComputableLoopNest(const LoopVector &LoopList) {
+    for (Loop *L : LoopList) {
+      const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+      if (ExitCountOuter == SE->getCouldNotCompute()) {
+        DEBUG(dbgs() << "Couldn't compute Backedge count\n");
+        return false;
+      }
+      if (L->getNumBackEdges() != 1) {
+        DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+        return false;
+      }
+      if (!L->getExitingBlock()) {
+        DEBUG(dbgs() << "Loop Doesn't have unique exit block\n");
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// \returns the index in \p LoopList of the loop to move outwards.
+  unsigned selectLoopForInterchange(const LoopVector &LoopList) {
+    // TODO: Add a better heuristic to select the loop to be interchanged based
+    // on the dependence matrix. Currently we select the innermost loop.
+    return LoopList.size() - 1;
+  }
+
+  /// Tries to bubble the selected loop of the nest \p LoopList (outermost
+  /// first) outwards, one neighbouring pair at a time, stopping at the first
+  /// pair that is not interchanged.
+  /// \returns true if at least one interchange happened.
+  bool processLoopList(LoopVector LoopList, Function &F) {
+
+    bool Changed = false;
+    CharMatrix DependencyMatrix;
+    if (LoopList.size() < 2) {
+      DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+      return false;
+    }
+    if (!isComputableLoopNest(LoopList)) {
+      DEBUG(dbgs() << "Not vaild loop candidate for interchange\n");
+      return false;
+    }
+    Loop *OuterMostLoop = *(LoopList.begin());
+
+    DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size()
+                 << "\n");
+
+    if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
+                                  OuterMostLoop, DA)) {
+      DEBUG(dbgs() << "Populating Dependency matrix failed\n");
+      return false;
+    }
+#ifdef DUMP_DEP_MATRICIES
+    DEBUG(dbgs() << "Dependence before inter change \n");
+    printDepMatrix(DependencyMatrix);
+#endif
+
+    // isComputableLoopNest() verified a single backedge, so the latch exists.
+    BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+    BranchInst *OuterMostLoopLatchBI =
+        dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+    if (!OuterMostLoopLatchBI)
+      return false;
+
+    // Since we currently do not handle LCSSA PHI's any failure in loop
+    // condition will now branch to LoopNestExit.
+    // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+    // Get the Outermost loop exit.
+    BasicBlock *LoopNestExit;
+    if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+    else
+      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+    if (isa<PHINode>(LoopNestExit->begin())) {
+      DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
+                      "since on failure all loops branch to loop nest exit.\n");
+      return false;
+    }
+
+    unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+    // Move the selected loop outwards to the best possible position.
+    for (unsigned i = SelecLoopId; i > 0; i--) {
+      bool Interchanged =
+          processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+      if (!Interchanged)
+        return Changed;
+      // Loops interchanged reflect the same in LoopList
+      std::swap(LoopList[i - 1], LoopList[i]);
+
+      // Update the DependencyMatrix
+      interChangeDepedencies(DependencyMatrix, i, i - 1);
+      DT->recalculate(F);
+#ifdef DUMP_DEP_MATRICIES
+      DEBUG(dbgs() << "Dependence after inter change \n");
+      printDepMatrix(DependencyMatrix);
+#endif
+      Changed |= Interchanged;
+    }
+    return Changed;
+  }
+
+  /// Checks legality and profitability of interchanging the loops at
+  /// \p InnerLoopId and \p OuterLoopId of \p LoopList and, if both hold, runs
+  /// the transform.
+  /// \returns false if the pair was rejected, true once the transform has
+  /// been invoked (its own result is not inspected here).
+  bool processLoop(const LoopVector &LoopList, unsigned InnerLoopId,
+                   unsigned OuterLoopId, BasicBlock *LoopNestExit,
+                   std::vector<std::vector<char>> &DependencyMatrix) {
+
+    DEBUG(dbgs() << "Processing Innder Loop Id = " << InnerLoopId
+                 << " and OuterLoopId = " << OuterLoopId << "\n");
+    Loop *InnerLoop = LoopList[InnerLoopId];
+    Loop *OuterLoop = LoopList[OuterLoopId];
+
+    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
+                                PreserveLCSSA);
+    if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+      DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+      return false;
+    }
+    DEBUG(dbgs() << "Loops are legal to interchange\n");
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+      DEBUG(dbgs() << "Interchanging Loops not profitable\n");
+      return false;
+    }
+
+    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
+                                 LoopNestExit, LIL.hasInnerLoopReduction());
+    LIT.transform();
+    DEBUG(dbgs() << "Loops interchanged\n");
+    return true;
+  }
+};
+
+} // end of namespace
+/// Returns true if every user of \p Ins is a PHI node that
+/// RecurrenceDescriptor recognises as a reduction PHI of \p L (vacuously true
+/// when \p Ins has no users).
+bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
+  return std::all_of(Ins->user_begin(), Ins->user_end(), [L](User *U) -> bool {
+    PHINode *UserPhi = dyn_cast<PHINode>(U);
+    if (!UserPhi)
+      return false;
+    RecurrenceDescriptor RD;
+    return RecurrenceDescriptor::isReductionPHI(UserPhi, L, RD);
+  });
+}
+
+/// Returns true if \p BB holds an instruction that prevents treating the
+/// loops as tightly nested: a load that is not used exclusively by reduction
+/// PHIs of the inner loop, or any other instruction that may have side
+/// effects or may read memory.
+bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
+    BasicBlock *BB) {
+  for (Instruction &Inst : *BB) {
+    // Load corresponding to reduction PHI's are safe while concluding if
+    // tightly nested.
+    if (LoadInst *Load = dyn_cast<LoadInst>(&Inst)) {
+      if (!areAllUsesReductions(Load, InnerLoop))
+        return true;
+      continue;
+    }
+    if (Inst.mayHaveSideEffects() || Inst.mayReadFromMemory())
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if \p BB holds an instruction that prevents treating the
+/// loops as tightly nested: a store whose stored value is not a PHI node, or
+/// any other instruction that may have side effects or may read memory.
+bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
+    BasicBlock *BB) {
+  for (Instruction &Inst : *BB) {
+    // Stores corresponding to reductions are safe while concluding if tightly
+    // nested.
+    if (StoreInst *Store = dyn_cast<StoreInst>(&Inst)) {
+      // Operand 0 of a store is the value being stored.
+      if (!isa<PHINode>(Store->getOperand(0)))
+        return true;
+      continue;
+    }
+    if (Inst.mayHaveSideEffects() || Inst.mayReadFromMemory())
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if \p InnerLoop is perfectly nested in \p OuterLoop: the outer
+/// header's branch only targets the inner preheader or the outer latch, and
+/// neither the outer header nor the outer latch contains unsafe instructions.
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+  BasicBlock *OuterHeader = OuterLoop->getHeader();
+  BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLatch = OuterLoop->getLoopLatch();
+
+  DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n");
+
+  // A perfectly nested loop has no branching between the outer and the inner
+  // loop: every successor of the outer header must be either the inner
+  // preheader or the outer latch.
+  BranchInst *HeaderBranch = dyn_cast<BranchInst>(OuterHeader->getTerminator());
+  if (!HeaderBranch)
+    return false;
+  for (unsigned Idx = 0, End = HeaderBranch->getNumSuccessors(); Idx != End;
+       ++Idx) {
+    BasicBlock *Succ = HeaderBranch->getSuccessor(Idx);
+    const bool IsExpectedTarget = Succ == InnerPreHeader || Succ == OuterLatch;
+    if (!IsExpectedTarget)
+      return false;
+  }
+
+  DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n");
+  // No block sits in between; now make sure the outer header and the outer
+  // latch themselves are free of unsafe instructions.
+  if (containsUnsafeInstructionsInHeader(OuterHeader))
+    return false;
+  if (containsUnsafeInstructionsInLatch(OuterLatch))
+    return false;
+
+  DEBUG(dbgs() << "Loops are perfectly nested \n");
+  // We have a perfect loop nest.
+  return true;
+}
+
+
+/// Returns true if every non-constant operand of \p InnerInduction is an
+/// instruction and the value entering from the inner loop preheader is
+/// invariant in the outer loop. A start value that depends on the outer loop
+/// (a triangular loop) makes this return false.
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+    PHINode *InnerInduction) {
+  BasicBlock *Preheader = InnerLoop->getLoopPreheader();
+
+  for (unsigned OpIdx = 0, NumOps = InnerInduction->getNumOperands();
+       OpIdx != NumOps; ++OpIdx) {
+    Value *Op = InnerInduction->getOperand(OpIdx);
+    if (isa<Constant>(Op))
+      continue;
+
+    Instruction *OpInst = dyn_cast<Instruction>(Op);
+    if (!OpInst)
+      return false;
+
+    // TODO: Handle triangular loops.
+    // e.g. for(int i=0;i<N;i++)
+    //        for(int j=i;j<N;j++)
+    BasicBlock *IncomingFrom = InnerInduction->getIncomingBlock(
+        PHINode::getIncomingValueNumForOperand(OpIdx));
+    if (IncomingFrom == Preheader && !OuterLoop->isLoopInvariant(OpInst))
+      return false;
+  }
+  return true;
+}
+
+/// Classify the header PHIs of \p L, appending induction PHIs to
+/// \p Inductions and reduction PHIs to \p Reductions.
+/// Returns false if \p L has no latch or no predecessor, or if a header PHI
+/// is neither an induction nor a reduction. PHIs classified before the
+/// failing one stay in the output vectors.
+bool LoopInterchangeLegality::findInductionAndReductions(
+    Loop *L, SmallVector<PHINode *, 8> &Inductions,
+    SmallVector<PHINode *, 8> &Reductions) {
+  const bool HasLatchAndPredecessor =
+      L->getLoopLatch() && L->getLoopPredecessor();
+  if (!HasLatchAndPredecessor)
+    return false;
+
+  for (BasicBlock::iterator It = L->getHeader()->begin(); isa<PHINode>(It);
+       ++It) {
+    PHINode *Phi = cast<PHINode>(It);
+    RecurrenceDescriptor RD;
+    InductionDescriptor ID;
+
+    if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) {
+      Inductions.push_back(Phi);
+      continue;
+    }
+    if (RecurrenceDescriptor::isReductionPHI(Phi, L, RD)) {
+      Reductions.push_back(Phi);
+      continue;
+    }
+
+    DEBUG(
+        dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+    return false;
+  }
+  return true;
+}
+
+/// Returns true if every PHI at the top of \p Block has exactly one incoming
+/// value and that value is an instruction. When \p isOuterLoopExitBlock is
+/// set, the incoming value must additionally be a PHI node itself.
+static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
+  for (auto It = Block->begin(); isa<PHINode>(It); ++It) {
+    PHINode *Phi = cast<PHINode>(It);
+
+    // Reduction lcssa phi will have only 1 incoming block that from loop latch.
+    if (PHI_HAS_MULTIPLE_INCOMING_PLACEHOLDER)
+      return false;
+  }
+  return true;
+}
+
+/// Returns the first successor of \p LatchBlock's branch that is not
+/// \p LoopHeader, or nullptr if the terminator is not a branch or every
+/// successor is the header. The branch is asserted to have two successors.
+static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
+                                         BasicBlock *LoopHeader) {
+  BranchInst *LatchBranch = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+  if (!LatchBranch)
+    return nullptr;
+
+  unsigned Num = LatchBranch->getNumSuccessors();
+  assert(Num == 2);
+  for (unsigned Idx = 0; Idx < Num; ++Idx) {
+    BasicBlock *Succ = LatchBranch->getSuccessor(Idx);
+    if (Succ != LoopHeader)
+      return Succ;
+  }
+  return nullptr;
+}
+
+// This function indicates the current limitations in the transform as a result
+// of which we do not proceed.
+//
+// Returns true if some limitation applies (the loops must not be
+// interchanged) and false if every check below passes. As a side effect it
+// sets InnerLoopHasReduction when the inner loop header has reduction PHIs.
+bool LoopInterchangeLegality::currentLimitations() {
+
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+ PHINode *InnerInductionVar;
+ SmallVector<PHINode *, 8> Inductions;
+ SmallVector<PHINode *, 8> Reductions;
+ // Every inner header PHI must be an induction or a reduction.
+ if (!findInductionAndReductions(InnerLoop, Inductions, Reductions))
+ return true;
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
+ << "Failed to interchange due to current limitation\n");
+ return true;
+ }
+ if (Reductions.size() > 0)
+ InnerLoopHasReduction = true;
+
+ // Take the single inner induction PHI; this also empties Inductions so the
+ // vectors can be reused for the outer loop.
+ InnerInductionVar = Inductions.pop_back_val();
+ Reductions.clear();
+ if (!findInductionAndReductions(OuterLoop, Inductions, Reductions))
+ return true;
+
+ // Outer loop cannot have reduction because then loops will not be tightly
+ // nested.
+ if (!Reductions.empty())
+ return true;
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1)
+ return true;
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ return true;
+ }
+
+ // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
+ BasicBlock *LoopExitBlock =
+ getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true))
+ return true;
+
+ LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false))
+ return true;
+
+ // TODO: Current limitation: Since we split the inner loop latch at the point
+ // were induction variable is incremented (induction.next); We cannot have
+ // more than 1 user of induction.next since it would result in broken code
+ // after split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ bool FoundInduction = false;
+ Instruction *InnerIndexVarInc = nullptr;
+ // The induction update is the PHI's incoming value that does not come from
+ // the preheader (the PHI is treated as having two incoming values).
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc)
+ return true;
+
+ // Since we split the inner loop latch on this induction variable. Make sure
+ // we do not have any instruction between the induction variable and branch
+ // instruction.
+
+ // Walk the latch backwards, skipping branch, compare and trunc instructions;
+ // the first other instruction met must be the induction update.
+ for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend();
+ I != E && !FoundInduction; ++I) {
+ if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I))
+ continue;
+ const Instruction &Ins = *I;
+ // We found an instruction. If this is not induction variable then it is not
+ // safe to split this loop latch.
+ if (!Ins.isIdenticalTo(InnerIndexVarInc))
+ return true;
+ else
+ FoundInduction = true;
+ }
+ // The loop latch ended and we didn't find the induction variable return as
+ // current limitation.
+ if (!FoundInduction)
+ return true;
+
+ return false;
+}
+
+/// Check whether OuterLoop and InnerLoop may be interchanged.
+///
+/// \p InnerLoopId and \p OuterLoopId are the columns of the two loops in
+/// \p DepMatrix. The interchange is accepted only if the dependence matrix
+/// allows it, no current limitation of the transform applies and the loops
+/// are tightly nested. Preheaders are inserted for either loop where the
+/// existing one is missing or unsuitable, so this query can modify the IR
+/// even when it ends up returning false.
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+                                                  unsigned OuterLoopId,
+                                                  CharMatrix &DepMatrix) {
+
+  if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+    // Keep a space on both sides of each number so the message stays
+    // readable.
+    DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+                 << " and OuterLoopId = " << OuterLoopId
+                 << " due to dependence\n");
+    return false;
+  }
+
+  // Create unique Preheaders if we already do not have one.
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+  // Create a unique outer preheader -
+  // 1) If OuterLoop preheader is not present.
+  // 2) If OuterLoop Preheader is same as OuterLoop Header
+  // 3) If OuterLoop Preheader is same as Header of the previous loop.
+  // 4) If OuterLoop Preheader is Entry node.
+  if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+      isa<PHINode>(OuterLoopPreHeader->begin()) ||
+      !OuterLoopPreHeader->getUniquePredecessor()) {
+    OuterLoopPreHeader =
+        InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
+  }
+
+  // The inner preheader must be a block of its own, distinct from both the
+  // inner and the outer loop header.
+  if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+      InnerLoopPreHeader == OuterLoop->getHeader()) {
+    InnerLoopPreHeader =
+        InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
+  }
+
+  // TODO: The loops could not be interchanged due to current limitations in the
+  // transform module.
+  if (currentLimitations()) {
+    DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+    return false;
+  }
+
+  // Check if the loops are tightly nested.
+  if (!tightlyNested(OuterLoop, InnerLoop)) {
+    DEBUG(dbgs() << "Loops not tightly nested\n");
+    return false;
+  }
+
+  return true;
+}
+
+/// Rough cache-locality cost of the current loop order.
+///
+/// Every GEP in the inner loop is scanned operand by operand. A GEP counts as
+/// "good" if an add recurrence of the inner loop appears after one of the
+/// outer loop, and as "bad" if the outer one appears after the inner one.
+/// Each GEP is counted at most once.
+/// \returns good count minus bad count; a negative value means the
+/// interchanged order looks better.
+int LoopInterchangeProfitability::getInstrOrderCost() {
+  // Signed counters: the result is a signed difference, so avoid computing it
+  // through unsigned wrap-around and an out-of-range conversion to int.
+  int GoodOrder = 0;
+  int BadOrder = 0;
+  for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
+       BI != BE; ++BI) {
+    for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) {
+      const Instruction &Ins = *I;
+      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins);
+      if (!GEP)
+        continue;
+
+      unsigned NumOp = GEP->getNumOperands();
+      bool FoundInnerInduction = false;
+      bool FoundOuterInduction = false;
+      for (unsigned i = 0; i < NumOp; ++i) {
+        const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+        if (!AR)
+          continue;
+
+        // If we find the inner induction after an outer induction e.g.
+        // for(int i=0;i<N;i++)
+        //   for(int j=0;j<N;j++)
+        //     A[i][j] = A[i-1][j-1]+k;
+        // then it is a good order.
+        if (AR->getLoop() == InnerLoop) {
+          // We found an InnerLoop induction after OuterLoop induction. It is
+          // a good order.
+          FoundInnerInduction = true;
+          if (FoundOuterInduction) {
+            GoodOrder++;
+            break;
+          }
+        }
+        // If we find the outer induction after an inner induction e.g.
+        // for(int i=0;i<N;i++)
+        //   for(int j=0;j<N;j++)
+        //     A[j][i] = A[j-1][i-1]+k;
+        // then it is a bad order.
+        if (AR->getLoop() == OuterLoop) {
+          // We found an OuterLoop induction after InnerLoop induction. It is
+          // a bad order.
+          FoundOuterInduction = true;
+          if (FoundInnerInduction) {
+            BadOrder++;
+            break;
+          }
+        }
+      }
+    }
+  }
+  return GoodOrder - BadOrder;
+}
+
+/// Returns true if, in every row of \p DepMatrix, the inner loop's entry is
+/// 'S' or 'I' and the outer loop's entry is '='. An empty matrix yields true.
+static bool isProfitabileForVectorization(unsigned InnerLoopId,
+                                          unsigned OuterLoopId,
+                                          CharMatrix &DepMatrix) {
+  // TODO: Improve this heuristic to catch more cases.
+  // If the inner loop is loop independent or doesn't carry any dependency it is
+  // profitable to move this to outer position.
+  for (const auto &DepRow : DepMatrix) {
+    const char InnerDir = DepRow[InnerLoopId];
+    const bool InnerCarriesNothing = InnerDir == 'S' || InnerDir == 'I';
+    if (!InnerCarriesNothing)
+      return false;
+    // TODO: We need to improve this heuristic.
+    if (DepRow[OuterLoopId] != '=')
+      return false;
+  }
+  // Every row passed both checks.
+  return true;
+}
+
+/// Returns true if interchanging the loops looks profitable: either the
+/// instruction-order cost is negative, or the dependence matrix satisfies
+/// isProfitabileForVectorization().
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+                                                unsigned OuterLoopId,
+                                                CharMatrix &DepMatrix) {
+  // TODO: Add better profitability checks.
+  // e.g
+  // 1) Construct dependency matrix and move the one with no loop carried dep
+  //    inside to enable vectorization.
+
+  // Rough cost estimate: count the good and bad orders of induction variables
+  // in the instructions and allow reordering when the bad ones outnumber the
+  // good ones.
+  const int Cost = getInstrOrderCost();
+  DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (Cost < 0)
+    return true;
+
+  // Not profitable under the cache model; still worth it if moving this loop
+  // outside improves parallelism.
+  return isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+}
+
+/// Remove \p InnerLoop from the direct sub-loops of \p OuterLoop.
+/// It is a fatal error (llvm_unreachable) if \p InnerLoop is not among them.
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+                                               Loop *InnerLoop) {
+  Loop::iterator It = OuterLoop->begin();
+  const Loop::iterator End = OuterLoop->end();
+  while (It != End && *It != InnerLoop)
+    ++It;
+
+  if (It == End)
+    llvm_unreachable("Couldn't find loop");
+  OuterLoop->removeChildLoop(It);
+}
+
+/// Update the loop tree after the interchange so that \p InnerLoop takes
+/// \p OuterLoop's place and \p OuterLoop becomes its child.
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+ Loop *OuterLoop) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, OuterLoop);
+ removeChildLoop(OuterLoop, InnerLoop);
+ OuterLoopParent->addChildLoop(InnerLoop);
+ } else {
+ // OuterLoop had no parent: InnerLoop replaces it as a top-level loop.
+ removeChildLoop(OuterLoop, InnerLoop);
+ LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ }
+
+ // Hand all former sub-loops of InnerLoop over to OuterLoop.
+ while (!InnerLoop->empty())
+ OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
+
+ InnerLoop->addChildLoop(OuterLoop);
+}
+
+/// Interchange OuterLoop and InnerLoop.
+///
+/// If the inner loop has no sub-loops, its latch is first split at the
+/// induction variable update and its header is split as well. Then the
+/// branches and preheaders of both loops are rewired and the loop tree is
+/// updated.
+/// \returns true on success; false if no induction variable (or no
+/// instruction updating it) was found, or if rewiring the loop links failed.
+bool LoopInterchangeTransform::transform() {
+
+  DEBUG(dbgs() << "transform\n");
+
+  if (InnerLoop->getSubLoops().empty()) {
+    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+    DEBUG(dbgs() << "Calling Split Inner Loop\n");
+    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+    if (!InductionPHI) {
+      DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+      return false;
+    }
+
+    // The update of the induction variable is the PHI's incoming value that
+    // does not come from the preheader.
+    Value *IndexUpdate = InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader
+                             ? InductionPHI->getIncomingValue(1)
+                             : InductionPHI->getIncomingValue(0);
+    Instruction *InnerIndexVar = dyn_cast<Instruction>(IndexUpdate);
+    // The update may not be an instruction at all (dyn_cast yields null);
+    // there is then no split point, so bail out before touching the IR
+    // instead of handing a null pointer to SplitBlock.
+    if (!InnerIndexVar) {
+      DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+      return false;
+    }
+
+    //
+    // Split at the place were the induction variable is
+    // incremented/decremented.
+    // TODO: This splitting logic may not work always. Fix this.
+    splitInnerLoopLatch(InnerIndexVar);
+    DEBUG(dbgs() << "splitInnerLoopLatch Done\n");
+
+    // Splits the inner loops phi nodes out into a separate basic block.
+    splitInnerLoopHeader();
+    DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
+  }
+
+  if (!adjustLoopLinks()) {
+    DEBUG(dbgs() << "adjustLoopLinks Failed\n");
+    return false;
+  }
+
+  restructureLoops(InnerLoop, OuterLoop);
+  return true;
+}
+
+/// Split the inner loop's latch block at \p Inc using SplitBlock. The block
+/// returned by SplitBlock is not needed here.
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
+  SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI);
+}
+
+/// Split the outer loop's latch block at its first non-PHI instruction using
+/// SplitBlock. The block returned by SplitBlock is not needed here.
+void LoopInterchangeTransform::splitOuterLoopLatch() {
+  BasicBlock *Latch = OuterLoop->getLoopLatch();
+  SplitBlock(Latch, Latch->getFirstNonPHI(), DT, LI);
+}
+
+/// Split the inner loop header.
+///
+/// Without reductions the header is split at its first non-PHI instruction.
+/// With reductions it is split right after its first instruction, the new
+/// block is added to the header's loop, and every PHI that ends up in the new
+/// block is replaced by its incoming value from the inner loop preheader and
+/// then erased.
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+
+  // Split the inner loop header out. Here make sure that the reduction PHI's
+  // stay in the innerloop body.
+  BasicBlock *Header = InnerLoop->getHeader();
+  BasicBlock *PreHeader = InnerLoop->getLoopPreheader();
+
+  if (!InnerLoopHasReduction) {
+    SplitBlock(Header, Header->getFirstNonPHI(), DT, LI);
+  } else {
+    // FIXME: Check if the induction PHI will always be the first PHI.
+    BasicBlock *SplitTail = Header->splitBasicBlock(
+        ++(Header->begin()), Header->getName() + ".split");
+    if (LI)
+      if (Loop *HeaderLoop = LI->getLoopFor(Header))
+        HeaderLoop->addBasicBlockToLoop(SplitTail, *LI);
+
+    // Adjust Reduction PHI's in the block: forward the preheader value to all
+    // users first, and erase the PHIs only once the scan is over so the
+    // iterator is never invalidated.
+    SmallVector<PHINode *, 8> DeadPhis;
+    for (auto It = SplitTail->begin(); isa<PHINode>(It); ++It) {
+      PHINode *Phi = cast<PHINode>(It);
+      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(PreHeader));
+      DeadPhis.push_back(Phi);
+    }
+    for (PHINode *Phi : DeadPhis)
+      Phi->eraseFromParent();
+  }
+
+  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+                  "InnerLoopHeader \n");
+}
+
+/// \brief Splice every instruction of \p FromBB except its terminator into
+/// the block that contains \p InsertBefore, placing them right before
+/// \p InsertBefore.
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+  BasicBlock *ToBB = InsertBefore->getParent();
+  auto &Dest = ToBB->getInstList();
+  auto &Src = FromBB->getInstList();
+
+  // Range moved: [first instruction, terminator).
+  Dest.splice(InsertBefore->getIterator(), Src, Src.begin(),
+              FromBB->getTerminator()->getIterator());
+}
+
+/// Move the non-terminator instructions of the outer loop preheader to the
+/// end of the inner loop preheader (just before its terminator).
+void LoopInterchangeTransform::adjustOuterLoopPreheader() {
+  BasicBlock *From = OuterLoop->getLoopPreheader();
+  Instruction *InsertPt = InnerLoop->getLoopPreheader()->getTerminator();
+  moveBBContents(From, InsertPt);
+}
+
+/// Move the non-terminator instructions of the inner loop preheader to the
+/// end of the outer loop header (just before its terminator).
+void LoopInterchangeTransform::adjustInnerLoopPreheader() {
+  BasicBlock *From = InnerLoop->getLoopPreheader();
+  Instruction *InsertPt = OuterLoop->getHeader()->getTerminator();
+  moveBBContents(From, InsertPt);
+}
+
+/// For every PHI at the top of \p CurrBlock, retarget each incoming edge that
+/// comes from \p OldPred so that it comes from \p NewPred instead. The
+/// incoming values are left untouched.
+void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
+                                                   BasicBlock *OldPred,
+                                                   BasicBlock *NewPred) {
+  for (auto It = CurrBlock->begin(); isa<PHINode>(It); ++It) {
+    PHINode *Phi = cast<PHINode>(It);
+    for (unsigned Idx = 0, End = Phi->getNumIncomingValues(); Idx != End;
+         ++Idx) {
+      if (Phi->getIncomingBlock(Idx) != OldPred)
+        continue;
+      Phi->setIncomingBlock(Idx, NewPred);
+    }
+  }
+}
+
+/// Rewire the branches of both loops so that the inner loop becomes the
+/// outer one and vice versa.
+/// Returns false, without having modified anything, if one of the blocks or
+/// branch instructions it relies on is missing; returns true after the
+/// rewiring.
+bool LoopInterchangeTransform::adjustLoopBranches() {
+
+ DEBUG(dbgs() << "adjustLoopBranches called\n");
+ // Adjust the loop preheader
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ // All validation happens before the first modification below.
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+ // Adjust Loop Preheader and headers
+
+ // The block that used to enter the outer preheader now enters the inner
+ // preheader.
+ unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+ OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+ }
+
+ // The outer header now branches to LoopExit instead of the outer latch and
+ // to the inner header's successor instead of the inner preheader.
+ NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+ OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+ else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+ OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+ }
+
+ // Adjust reduction PHI's now that the incoming block has changed.
+ updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
+ OuterLoopHeader);
+
+ // Replace the inner header's terminator with an unconditional branch to the
+ // outer preheader.
+ BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+ InnerLoopHeaderBI->eraseFromParent();
+
+ // -------------Adjust loop latches-----------
+ // The inner latch successor that is not the inner header.
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ // The predecessor of the inner latch now bypasses the latch.
+ NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+ InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+ }
+
+ // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
+ // the value and remove this PHI node from inner loop.
+ // The PHIs are collected first so that erasing them does not disturb the
+ // iteration over the block.
+ SmallVector<PHINode *, 8> LcssaVec;
+ for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
+ PHINode *LcssaPhi = cast<PHINode>(I);
+ LcssaVec.push_back(LcssaPhi);
+ }
+ for (auto I = LcssaVec.begin(), E = LcssaVec.end(); I != E; ++I) {
+ PHINode *P = *I;
+ Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
+ P->replaceAllUsesWith(Incoming);
+ P->eraseFromParent();
+ }
+
+ // The outer latch successor that is not the outer header.
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ // The inner latch now leaves to where the outer latch used to leave.
+ if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+ InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+ else
+ InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+ updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
+
+ // The outer latch now continues into the inner latch instead of leaving.
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+ OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+ } else {
+ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ }
+
+ return true;
+}
+/// Swap the contents (everything but the terminators) of the two loop
+/// preheaders to match the interchanged loop order.
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheader as well.
+ // This is because the content of inner preheader was previously executed
+ // inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ // Insertion point for the second move; cast<> asserts that the inner
+ // preheader ends in a branch.
+ BranchInst *InnerTermBI =
+ cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+ // These instructions should now be executed inside the loop.
+ // Move instruction into a new block after outer header.
+ moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
+ // These instructions were not executed previously in the loop so move them to
+ // the older inner loop preheader.
+ moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+/// Rewire the branches of both loops and, if that succeeded, swap the
+/// preheader contents. Returns whether the branches were rewired.
+bool LoopInterchangeTransform::adjustLoopLinks() {
+  // Adjust all branches in the inner and outer loop.
+  if (!adjustLoopBranches())
+    return false;
+
+  adjustLoopPreheaders();
+  return true;
+}
+
+// Pass identification and registration under the name "loop-interchange".
+char LoopInterchange::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+// Passes that must be initialized before this one.
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+
+INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+
+// Factory for the pass; the caller receives a newly allocated instance.
+Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
new file mode 100644
index 0000000..1064d08
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,566 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the
+// users of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <forward_list>
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+using namespace llvm;
+
+static cl::opt<unsigned> CheckPerElim(
+ "runtime-check-per-loop-load-elim", cl::Hidden,
+ cl::desc("Max number of memchecks allowed per eliminated load on average"),
+ cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+ "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Load Elimination"));
+
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// \brief Represent a store-to-load forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+  LoadInst *Load;
+  StoreInst *Store;
+
+  StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+      : Load(Load), Store(Store) {}
+
+  /// \brief Return true if the dependence from the store to the load has a
+  /// distance of one iteration. E.g. A[i+1] = A[i]
+  ///
+  /// This holds only when both pointers advance by the same constant step per
+  /// iteration, that step is exactly one element (in either direction), and
+  /// the store address is exactly one step ahead of the load address.
+  bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const {
+    Value *LoadPtr = Load->getPointerOperand();
+    Value *StorePtr = Store->getPointerOperand();
+    Type *LoadPtrType = LoadPtr->getType();
+    Type *LoadType = LoadPtrType->getPointerElementType();
+
+    assert(LoadPtrType->getPointerAddressSpace() ==
+               StorePtr->getType()->getPointerAddressSpace() &&
+           LoadType == StorePtr->getType()->getPointerElementType() &&
+           "Should be a known dependence");
+
+    auto &DL = Load->getParent()->getModule()->getDataLayout();
+    unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+    auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+    auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+    // The store-minus-load distance alone only relates the two addresses
+    // within one iteration. For the stored value to be what the load reads in
+    // the *next* iteration, the pointers must also advance by exactly that
+    // distance per iteration. Without this check a non-unit-stride pair such
+    // as "A[2*i+1] = ...; ... = A[2*i]" would be accepted.
+    ScalarEvolution &SE = *PSE.getSE();
+    auto *LoadStep = dyn_cast<SCEVConstant>(LoadPtrSCEV->getStepRecurrence(SE));
+    auto *StoreStep =
+        dyn_cast<SCEVConstant>(StorePtrSCEV->getStepRecurrence(SE));
+    // SCEV expressions are uniqued, so pointer equality means equal steps.
+    if (!LoadStep || LoadStep != StoreStep)
+      return false;
+
+    // Only unit stride (forward or reverse) is handled.
+    const APInt &Step = LoadStep->getAPInt();
+    if (Step.abs() != TypeByteSize)
+      return false;
+
+    // We don't need to check non-wrapping here because forward/backward
+    // dependence wouldn't be valid if these weren't monotonic accesses.
+    auto *Dist =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+    // The store must be exactly one step ahead of the load, sign included.
+    return Dist && Dist->getAPInt() == Step;
+  }
+
+  /// \brief Return the pointer operand of the candidate load.
+  Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+  /// \brief Print the candidate as "<store> -->" followed by the indented load.
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const StoreToLoadForwardingCandidate &Cand) {
+    OS << *Cand.Store << " -->\n";
+    OS.indent(2) << *Cand.Load << "\n";
+    return OS;
+  }
+#endif
+};
+
+/// \brief Return true if \p StoreBlock dominates every latch of \p L.
+///
+/// When that holds, and as long as there is no intervening store, the stored
+/// value is the one that will be loaded in the next iteration.
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+                                  DominatorTree *DT) {
+  SmallVector<BasicBlock *, 8> LoopLatches;
+  L->getLoopLatches(LoopLatches);
+
+  // Bail out on the first latch that the store's block does not dominate.
+  for (const BasicBlock *Latch : LoopLatches)
+    if (!DT->dominates(StoreBlock, Latch))
+      return false;
+  return true;
+}
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+ LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+ DominatorTree *DT)
+ : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {}
+
+  /// \brief Walk the dependences recorded by LAA for this loop (loop-carried
+  /// and loop-independent alike) and collect the store->load ones.
+  ///
+  /// Nothing is returned when LAA recorded no dependences, e.g. because it
+  /// failed to analyze the loop (not bottom-tested, volatile memops, etc.).
+  std::forward_list<StoreToLoadForwardingCandidate>
+  findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+    std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+    const auto *Deps = LAI.getDepChecker().getDependences();
+    if (!Deps)
+      return Candidates;
+
+    // We want store->load dependences (i.e. true dependences), whether they
+    // are lexically forward or backward. Any load that additionally takes
+    // part in an unknown dependence is disqualified at the end.
+    SmallSet<Instruction *, 4> LoadsWithUnknownDependence;
+    auto RecordIfLoad = [&](Instruction *I) {
+      if (isa<LoadInst>(I))
+        LoadsWithUnknownDependence.insert(I);
+    };
+
+    for (const auto &Dep : *Deps) {
+      Instruction *Source = Dep.getSource(LAI);
+      Instruction *Destination = Dep.getDestination(LAI);
+
+      if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+        RecordIfLoad(Source);
+        RecordIfLoad(Destination);
+        continue;
+      }
+
+      // Source and destination are named after program order, i.e. source
+      // always comes first; the direction is carried by the dependence type.
+      // Flip them for backward dependences so Source is the producer.
+      if (Dep.isBackward()) {
+        std::swap(Source, Destination);
+      } else {
+        assert(Dep.isForward() && "Needs to be a forward dependence");
+      }
+
+      auto *Store = dyn_cast<StoreInst>(Source);
+      auto *Load = dyn_cast<LoadInst>(Destination);
+      if (Store && Load)
+        Candidates.emplace_front(Load, Store);
+    }
+
+    if (!LoadsWithUnknownDependence.empty())
+      Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+        return LoadsWithUnknownDependence.count(C.Load);
+      });
+
+    return Candidates;
+  }
+
+ /// \brief Return the index of the instruction according to program order.
+ unsigned getInstrIndex(Instruction *Inst) {
+ auto I = InstOrder.find(Inst);
+ assert(I != InstOrder.end() && "No index for instruction");
+ return I->second;
+ }
+
+ /// \brief If a load has multiple candidates associated (i.e. different
+ /// stores), it means that it could be forwarding from multiple stores
+ /// depending on control flow. Remove these candidates.
+ ///
+ /// Here, we rely on LAA to include the relevant loop-independent dependences.
+ /// LAA is known to omit these in the very simple case when the read and the
+ /// write within an alias set always take place using the *same* pointer.
+ ///
+ /// However, we know that this is not the case here, i.e. we can rely on LAA
+ /// to provide us with loop-independent dependences for the cases we're
+ /// interested in. Consider the case for example where a loop-independent
+ /// dependence S1->S2 invalidates the forwarding S3->S2.
+ ///
+ /// A[i] = ... (S1)
+ /// ... = A[i] (S2)
+ /// A[i+1] = ... (S3)
+ ///
+ /// LAA will perform dependence analysis here because there are two
+ /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+ void removeDependencesFromMultipleStores(
+ std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+ // A nullptr candidate means that we have multiple stores forwarding to
+ // this load.
+ typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
+ LoadToSingleCandT;
+ LoadToSingleCandT LoadToSingleCand;
+
+ for (const auto &Cand : Candidates) {
+ bool NewElt;
+ LoadToSingleCandT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+ if (!NewElt) {
+ const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+ // Already multiple stores forward to this load.
+ if (OtherCand == nullptr)
+ continue;
+
+ // Handle the very basic case when the two stores are in the same
+ // block so deciding which one forwards is easy. The later one forwards
+ // as long as they both have a dependence distance of one to the load.
+ if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+ Cand.isDependenceDistanceOfOne(PSE) &&
+ OtherCand->isDependenceDistanceOfOne(PSE)) {
+ // They are in the same block, the later one will forward to the load.
+ if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+ OtherCand = &Cand;
+ } else
+ OtherCand = nullptr;
+ }
+ }
+
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+ if (LoadToSingleCand[Cand.Load] != &Cand) {
+ DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
+ return true;
+ }
+ return false;
+ });
+ }
+
+ /// \brief Given two pointer operations, identified by their
+ /// RuntimePointerChecking indices, return true if they require an alias check.
+ ///
+ /// We need a check if one is a pointer for a candidate load and the other is
+ /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+ const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+ const std::set<Value *> &CandLoadPtrs) {
+ Value *Ptr1 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+ Value *Ptr2 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+ return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+ (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+ }
+
+ /// \brief Return pointers that are possibly written to on the path from a
+ /// forwarding store to a load.
+ ///
+ /// These pointers need to be alias-checked against the forwarding candidates.
+ SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+ // From FirstStore to LastLoad neither of the elimination candidate loads
+ // should overlap with any of the stores.
+ //
+ // E.g.:
+ //
+ // st1 C[i]
+ // ld1 B[i] <-------,
+ // ld0 A[i] <----, | * LastLoad
+ // ... | |
+ // st2 E[i] | |
+ // st3 B[i+1] -- | -' * FirstStore
+ // st0 A[i+1] ---'
+ // st4 D[i]
+ //
+ // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+ // ld0.
+
+ LoadInst *LastLoad =
+ std::max_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+ })
+ ->Load;
+ StoreInst *FirstStore =
+ std::min_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Store) <
+ getInstrIndex(B.Store);
+ })
+ ->Store;
+
+ // We're looking for stores after the first forwarding store until the end
+ // of the loop, then from the beginning of the loop until the last
+ // forwarded-to load. Collect the pointer for the stores.
+ SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+ auto InsertStorePtr = [&](Instruction *I) {
+ if (auto *S = dyn_cast<StoreInst>(I))
+ PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+ };
+ const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+ std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+ MemInstrs.end(), InsertStorePtr);
+ std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+ InsertStorePtr);
+
+ return PtrsWrittenOnFwdingPath;
+ }
+
+ /// \brief Determine the pointer alias checks to prove that there are no
+ /// intervening stores.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+ SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ findPointersWrittenOnForwardingPath(Candidates);
+
+ // Collect the pointers of the candidate loads.
+ // FIXME: SmallSet does not work with std::inserter.
+ std::set<Value *> CandLoadPtrs;
+ std::transform(Candidates.begin(), Candidates.end(),
+ std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+ std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+ const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2,
+ PtrsWrittenOnFwdingPath, CandLoadPtrs))
+ return true;
+ return false;
+ });
+
+ DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+ DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ return Checks;
+ }
+
+ /// \brief Perform the transformation for a candidate.
+ void
+ propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+ SCEVExpander &SEE) {
+ //
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ Value *Ptr = Cand.Load->getPointerOperand();
+ auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+ auto *PH = L->getLoopPreheader();
+ Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+ PH->getTerminator());
+ Value *Initial =
+ new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+ PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+ &L->getHeader()->front());
+ PHI->addIncoming(Initial, PH);
+ PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+ Cand.Load->replaceAllUsesWith(PHI);
+ }
+
+  /// \brief Top-level driver for each loop: find store->load forwarding
+  /// candidates, add run-time checks and perform transformation.
+  ///
+  /// \returns true if at least one load was forwarded, false if the loop was
+  /// left untouched.
+  bool processLoop() {
+    DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+    // Look for store-to-load forwarding cases across the
+    // backedge. E.g.:
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    // First start with store->load dependences.
+    auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Generate an index for each load and store according to the original
+    // program order. This will be used later.
+    InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+    // To keep things simple for now, remove those where the load is potentially
+    // fed by multiple stores.
+    removeDependencesFromMultipleStores(StoreToLoadDependences);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Filter the candidates further.
+    SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+    unsigned NumForwarding = 0;
+    // Bind by reference: there is no need to copy each candidate just to
+    // inspect it.
+    for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
+      DEBUG(dbgs() << "Candidate " << Cand);
+      // Make sure that the stored value is available everywhere in the loop in
+      // the next iteration.
+      if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+        continue;
+
+      // Check whether the SCEV difference is the same as the induction step,
+      // thus we load the value in the next iteration.
+      if (!Cand.isDependenceDistanceOfOne(PSE))
+        continue;
+
+      ++NumForwarding;
+      DEBUG(dbgs()
+            << NumForwarding
+            << ". Valid store-to-load forwarding across the loop backedge\n");
+      Candidates.push_back(Cand);
+    }
+    if (Candidates.empty())
+      return false;
+
+    // Check intervening may-alias stores. These need runtime checks for alias
+    // disambiguation.
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+        collectMemchecks(Candidates);
+
+    // Too many checks are likely to outweigh the benefits of forwarding.
+    if (Checks.size() > Candidates.size() * CheckPerElim) {
+      DEBUG(dbgs() << "Too many run-time checks needed.\n");
+      return false;
+    }
+
+    if (LAI.PSE.getUnionPredicate().getComplexity() >
+        LoadElimSCEVCheckThreshold) {
+      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+      return false;
+    }
+
+    // The transformation below inserts the initial load in the preheader and
+    // feeds the PHI from the unique latch. Both only exist when the loop is in
+    // loop-simplify form, so give up otherwise instead of dereferencing null.
+    if (!L->isLoopSimplifyForm()) {
+      DEBUG(dbgs() << "Loop is not in loop-simplify form.\n");
+      return false;
+    }
+
+    // Point of no-return, start the transformation. First, version the loop if
+    // necessary.
+    if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) {
+      LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
+      LV.setAliasChecks(std::move(Checks));
+      LV.setSCEVChecks(LAI.PSE.getUnionPredicate());
+      LV.versionLoop();
+    }
+
+    // Next, propagate the value stored by the store to the users of the load.
+    // Also for the first iteration, generate the initial value of the load.
+    SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+                     "storeforward");
+    for (const auto &Cand : Candidates)
+      propagateStoredValueToLoadUsers(Cand, SEE);
+    NumLoopLoadEliminted += NumForwarding;
+
+    return true;
+  }
+
+private:
+ Loop *L;
+
+ /// \brief Maps the load/store instructions to their index according to
+ /// program order.
+ DenseMap<Instruction *, unsigned> InstOrder;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo &LAI;
+ DominatorTree *DT;
+ PredicatedScalarEvolution PSE;
+};
+
+/// \brief The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+ LoopLoadElimination() : FunctionPass(ID) {
+ initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ // Build up a worklist of inner-loops to transform. This is necessary as the
+ // act of versioning a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, LI, LAI, DT);
+ Changed |= LEL.processLoop();
+ }
+
+ // Report whether any of the loops was transformed.
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ static char ID;
+};
+}
+
+char LoopLoadElimination::ID;
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopLoadEliminationPass() {
+ return new LoopLoadElimination();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 0000000..27c2d88
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1527 @@
+//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reroll"
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+ cl::desc("The maximum increment for loop rerolling"));
+
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+ cl::Hidden,
+ cl::desc("The maximum number of failures to tolerate"
+ " during fuzzy matching. (default: 400)"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+// for (int i = 0; i < 500; i += 3) {
+// foo(i);
+// foo(i+1);
+// foo(i+2);
+// }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i)
+// foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i) {
+// x[3*i] = foo(0);
+// x[3*i+1] = foo(0);
+// x[3*i+2] = foo(0);
+// }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 1500; ++i)
+// x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+ enum IterationLimits {
+ /// The maximum number of iterations that we'll try and reroll. This
+ /// has to be less than 25 in order to fit into a SmallBitVector.
+ IL_MaxRerollIterations = 16,
+ /// The bitvector index used by loop induction variables and other
+ /// instructions that belong to all iterations.
+ IL_All,
+ IL_End
+ };
+
+ class LoopReroll : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopReroll() : LoopPass(ID) {
+ initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ protected:
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+ typedef SmallSet<Instruction *, 16> SmallInstructionSet;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> IVToIncMap;
+
+ // A chain of isomorphic instructions, identified by a single-use PHI
+ // representing a reduction. Only the last value may be used outside the
+ // loop.
+ struct SimpleLoopReduction {
+ SimpleLoopReduction(Instruction *P, Loop *L)
+ : Valid(false), Instructions(1, P) {
+ assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+ add(L);
+ }
+
+ bool valid() const {
+ return Valid;
+ }
+
+ Instruction *getPHI() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.front();
+ }
+
+ Instruction *getReducedValue() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.back();
+ }
+
+ Instruction *get(size_t i) const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions[i+1];
+ }
+
+ Instruction *operator [] (size_t i) const { return get(i); }
+
+ // The size, ignoring the initial PHI.
+ size_t size() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.size()-1;
+ }
+
+ typedef SmallInstructionVector::iterator iterator;
+ typedef SmallInstructionVector::const_iterator const_iterator;
+
+ iterator begin() {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ const_iterator begin() const {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ protected:
+ bool Valid;
+ SmallInstructionVector Instructions;
+
+ void add(Loop *L);
+ };
+
+ // The set of all reductions, and state tracking of possible reductions
+ // during loop instruction processing.
+ struct ReductionTracker {
+ typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+
+ // Add a new possible reduction.
+ void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
+
+ // Setup to track possible reductions corresponding to the provided
+ // rerolling scale. Only reductions with a number of non-PHI instructions
+ // that is divisible by the scale are considered. Three instruction sets
+ // are filled in:
+ // - A set of all possible instructions in eligible reductions.
+ // - A set of all PHIs in eligible reductions
+ // - A set of all reduced values (last instructions) in eligible
+ // reductions.
+ void restrictToScale(uint64_t Scale,
+ SmallInstructionSet &PossibleRedSet,
+ SmallInstructionSet &PossibleRedPHISet,
+ SmallInstructionSet &PossibleRedLastSet) {
+ PossibleRedIdx.clear();
+ PossibleRedIter.clear();
+ Reds.clear();
+
+ for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+ if (PossibleReds[i].size() % Scale == 0) {
+ PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+ PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+ PossibleRedSet.insert(PossibleReds[i].getPHI());
+ PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+ for (Instruction *J : PossibleReds[i]) {
+ PossibleRedSet.insert(J);
+ PossibleRedIdx[J] = i;
+ }
+ }
+ }
+
+ // The functions below are used while processing the loop instructions.
+
+ // Are the two instructions both from reductions, and furthermore, from
+ // the same reduction?
+ bool isPairInSame(Instruction *J1, Instruction *J2) {
+ DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+ if (J1I != PossibleRedIdx.end()) {
+ DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+ if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+ return true;
+ }
+
+ return false;
+ }
+
+ // The two provided instructions, the first from the base iteration, and
+ // the second from iteration i, form a matched pair. If these are part of
+ // a reduction, record that fact.
+ void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+ if (PossibleRedIdx.count(J1)) {
+ assert(PossibleRedIdx.count(J2) &&
+ "Recording reduction vs. non-reduction instruction?");
+
+ PossibleRedIter[J1] = 0;
+ PossibleRedIter[J2] = i;
+
+ int Idx = PossibleRedIdx[J1];
+ assert(Idx == PossibleRedIdx[J2] &&
+ "Recording pair from different reductions?");
+ Reds.insert(Idx);
+ }
+ }
+
+ // The functions below can be called after we've finished processing all
+ // instructions in the loop, and we know which reductions were selected.
+
+ bool validateSelected();
+ void replaceSelected();
+
+ protected:
+ // The vector of all possible reductions (for any scale).
+ SmallReductionVector PossibleReds;
+
+ DenseMap<Instruction *, int> PossibleRedIdx;
+ DenseMap<Instruction *, int> PossibleRedIter;
+ DenseSet<int> Reds;
+ };
+
+ // A DAGRootSet models an induction variable being used in a rerollable
+ // loop. For example,
+ //
+ // x[i*3+0] = y1
+ // x[i*3+1] = y2
+ // x[i*3+2] = y3
+ //
+ // Base instruction -> i*3
+ // +---+----+
+ // / | \
+ // ST[y1] +1 +2 <-- Roots
+ // | |
+ // ST[y2] ST[y3]
+ //
+ // There may be multiple DAGRoots, for example:
+ //
+ // x[i*2+0] = ... (1)
+ // x[i*2+1] = ... (1)
+ // x[i*2+4] = ... (2)
+ // x[i*2+5] = ... (2)
+ // x[(i+1234)*2+5678] = ... (3)
+ // x[(i+1234)*2+5679] = ... (3)
+ //
+ // The loop will be rerolled by adding a new loop induction variable,
+ // one for the Base instruction in each DAGRootSet.
+ //
+ struct DAGRootSet {
+ Instruction *BaseInst;
+ SmallInstructionVector Roots;
+ // The instructions between IV and BaseInst (but not including BaseInst).
+ SmallInstructionSet SubsumedInsts;
+ };
+
+ // The set of all DAG roots, and state tracking of all roots
+ // for a particular induction variable.
+ //
+ // The three public stages are meant to run in order: findRoots(), then
+ // validate(), then replace().
+ struct DAGRootTracker {
+   DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+                  ScalarEvolution *SE, AliasAnalysis *AA,
+                  TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
+                  bool PreserveLCSSA,
+                  DenseMap<Instruction *, int64_t> &IncrMap)
+       // Inc and Scale are only computed by findRoots(); start them at a
+       // defined value so that reading them earlier is not undefined.
+       : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
+         PreserveLCSSA(PreserveLCSSA), IV(IV), Inc(0), Scale(0),
+         IVToIncMap(IncrMap) {}
+
+   /// Stage 1: Find all the DAG roots for the induction variable.
+   bool findRoots();
+   /// Stage 2: Validate if the found roots are valid.
+   bool validate(ReductionTracker &Reductions);
+   /// Stage 3: Assuming validate() returned true, perform the
+   /// replacement.
+   /// @param IterCount The maximum iteration count of L.
+   void replace(const SCEV *IterCount);
+
+ protected:
+   // Per-instruction bit vector; bit K set means "used by iteration K".
+   typedef MapVector<Instruction*, SmallBitVector> UsesTy;
+
+   // Root discovery helpers. SubsumedInsts is taken by value so that each
+   // search path works on its own copy.
+   bool findRootsRecursive(Instruction *IVU,
+                           SmallInstructionSet SubsumedInsts);
+   bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+   bool collectPossibleRoots(Instruction *Base,
+                             std::map<int64_t,Instruction*> &Roots);
+
+   // Use-set collection helpers.
+   bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+   void collectInLoopUserSet(const SmallInstructionVector &Roots,
+                             const SmallInstructionSet &Exclude,
+                             const SmallInstructionSet &Final,
+                             DenseSet<Instruction *> &Users);
+   void collectInLoopUserSet(Instruction *Root,
+                             const SmallInstructionSet &Exclude,
+                             const SmallInstructionSet &Final,
+                             DenseSet<Instruction *> &Users);
+
+   // Matching helpers used by validate().
+   UsesTy::iterator nextInstr(int Val, UsesTy &In,
+                              const SmallInstructionSet &Exclude,
+                              UsesTy::iterator *StartI=nullptr);
+   bool isBaseInst(Instruction *I);
+   bool isRootInst(Instruction *I);
+   bool instrDependsOn(Instruction *I,
+                       UsesTy::iterator Start,
+                       UsesTy::iterator End);
+
+   LoopReroll *Parent;
+
+   // Members of Parent, replicated here for brevity.
+   Loop *L;
+   ScalarEvolution *SE;
+   AliasAnalysis *AA;
+   TargetLibraryInfo *TLI;
+   DominatorTree *DT;
+   LoopInfo *LI;
+   bool PreserveLCSSA;
+
+   // The loop induction variable.
+   Instruction *IV;
+   // Loop step amount.
+   int64_t Inc;
+   // Loop reroll count; if Inc == 1, this records the scaling applied
+   // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+   // If Inc is not 1, Scale = Inc.
+   uint64_t Scale;
+   // The roots themselves.
+   SmallVector<DAGRootSet,16> RootSets;
+   // All increment instructions for IV.
+   SmallInstructionVector LoopIncs;
+   // Map of all instructions in the loop (in order) to the iterations
+   // they are used in (or specially, IL_All for instructions
+   // used in the loop increment mechanism).
+   UsesTy Uses;
+   // Map between induction variable and its increment
+   DenseMap<Instruction *, int64_t> &IVToIncMap;
+ };
+
+ // Collect the header PHIs of L that are candidate induction variables.
+ void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+ // Collect the header PHIs of L that start a valid simple reduction chain.
+ void collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions);
+ // Reroll the provided loop with respect to the provided induction variable.
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+ ReductionTracker &Reductions);
+ };
+}
+
+// Pass registration: the static pass ID, followed by the "loop-reroll" pass
+// entry and the analyses it is declared to depend on.
+char LoopReroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+// Factory for the loop-reroll pass: hands back a newly allocated LoopReroll.
+Pass *llvm::createLoopRerollPass() {
+  Pass *RerollPass = new LoopReroll;
+  return RerollPass;
+}
+
+// Reports whether at least one user of I is not contained in loop L.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+  for (Value::user_iterator UI = I->user_begin(), UE = I->user_end();
+       UI != UE; ++UI) {
+    Instruction *UserInst = cast<Instruction>(*UI);
+    if (L->contains(UserInst))
+      continue;
+    return true;
+  }
+  return false;
+}
+
+// Gather the header PHIs of L with respect to which rerolling might be
+// possible. A PHI qualifies when it has integer type, its SCEV is an affine
+// add-recurrence on L, and the step is a non-zero constant whose absolute
+// value is below MaxInc. The step of every accepted PHI is recorded in
+// IVToIncMap and the PHI is appended to PossibleIVs.
+void LoopReroll::collectPossibleIVs(Loop *L,
+                                    SmallInstructionVector &PossibleIVs) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    Instruction *Candidate = &*I;
+    if (!isa<PHINode>(Candidate) || !Candidate->getType()->isIntegerTy())
+      continue;
+
+    const SCEVAddRecExpr *PHISCEV =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Candidate));
+    if (!PHISCEV || PHISCEV->getLoop() != L || !PHISCEV->isAffine())
+      continue;
+
+    const SCEVConstant *IncSCEV =
+        dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+    if (!IncSCEV)
+      continue;
+
+    // Reject a zero step and any step whose magnitude reaches MaxInc.
+    if (IncSCEV->getValue()->isZero() ||
+        IncSCEV->getAPInt().abs().uge(MaxInc))
+      continue;
+
+    IVToIncMap[Candidate] = IncSCEV->getValue()->getSExtValue();
+    DEBUG(dbgs() << "LRR: Possible IV: " << *Candidate << " = " << *PHISCEV
+                 << "\n");
+    PossibleIVs.push_back(Candidate);
+  }
+}
+
+// Extend the instruction vector with the rest of the reduction-variable chain
+// (the starting PHINode is already in it). The object is marked valid only if
+// the whole chain has the required shape; otherwise the function returns early
+// and Valid stays false.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+  assert(!Valid && "Cannot add to an already-valid chain");
+
+  // The reduction variable must be a chain of single-use instructions
+  // (including the PHI), except for the last value (which is used by the PHI
+  // and also outside the loop).
+  Instruction *Cur = Instructions.front();
+  if (Cur->user_empty())
+    return;
+
+  // Follow the first user of each link until reaching a value that does not
+  // have exactly one use.
+  for (;;) {
+    Cur = cast<Instruction>(*Cur->user_begin());
+    if (!Cur->hasOneUse())
+      break;
+
+    if (!Cur->isBinaryOp())
+      return;
+
+    // Every link after the PHI must be the same operation as its predecessor.
+    if (!isa<PHINode>(Instructions.back()) &&
+        !Cur->isSameOperationAs(Instructions.back()))
+      return;
+
+    Instructions.push_back(Cur);
+  }
+
+  if (Instructions.size() < 2)
+    return;
+  if (!Cur->isSameOperationAs(Instructions.back()))
+    return;
+  if (Cur->use_empty())
+    return;
+
+  // Cur is now the (potential) last instruction in the reduction chain.
+  // The only in-loop user can be the initial PHI.
+  for (User *U : Cur->users()) {
+    Instruction *UserInst = cast<Instruction>(U);
+    if (L->contains(UserInst) && UserInst != Instructions.front())
+      return;
+  }
+
+  Instructions.push_back(Cur);
+  Valid = true;
+}
+
+// Build a SimpleLoopReduction for every single-value-typed PHI at the top of
+// the header of L and register the ones that turn out valid with Reductions.
+void LoopReroll::collectPossibleReductions(Loop *L,
+                                           ReductionTracker &Reductions) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    Instruction *Candidate = &*I;
+    if (!isa<PHINode>(Candidate) ||
+        !Candidate->getType()->isSingleValueType())
+      continue;
+
+    SimpleLoopReduction SLR(Candidate, L);
+    if (SLR.valid()) {
+      DEBUG(dbgs() << "LRR: Possible reduction: " << *Candidate << " (with " <<
+            SLR.size() << " chained instructions)\n");
+      Reductions.addSLR(SLR);
+    }
+  }
+}
+
+// Compute the transitive in-loop user set of Root and add it to Users. The
+// set contains Root, its users, their users, and so on, subject to two
+// exceptions:
+//
+//   1. Instructions in Exclude are never added (even if they are users).
+//      This is used, for example, to keep root increments out of the use set
+//      of the primary IV.
+//
+//   2. Instructions in Final are added if they are users, but the walk does
+//      not continue through their users. This is used, for example, to
+//      prevent a reduction update from forcing all later reduction updates
+//      into the use set.
+//
+// In addition, in-loop operands that have exactly one use ("feeders") are
+// pulled in, unless they are in Exclude or Final.
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+    Instruction *Root, const SmallInstructionSet &Exclude,
+    const SmallInstructionSet &Final,
+    DenseSet<Instruction *> &Users) {
+  SmallInstructionVector Worklist(1, Root);
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.pop_back_val();
+    // Skip anything that is already part of the result.
+    if (!Users.insert(Cur).second)
+      continue;
+
+    if (!Final.count(Cur)) {
+      for (Use &U : Cur->uses()) {
+        Instruction *UserInst = cast<Instruction>(U.getUser());
+
+        // Ignore "wrap-around" uses to PHIs of this loop's header.
+        PHINode *PN = dyn_cast<PHINode>(UserInst);
+        if (PN && PN->getIncomingBlock(U) == L->getHeader())
+          continue;
+
+        if (!L->contains(UserInst) || Exclude.count(UserInst))
+          continue;
+        Worklist.push_back(UserInst);
+      }
+    }
+
+    // We also want to collect single-user "feeder" values.
+    for (unsigned OpIdx = 0, NumOps = Cur->getNumOperands(); OpIdx != NumOps;
+         ++OpIdx) {
+      Instruction *Op = dyn_cast<Instruction>(Cur->getOperand(OpIdx));
+      if (!Op)
+        continue;
+      if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+          !Final.count(Op))
+        Worklist.push_back(Op);
+    }
+  }
+}
+
+// Run the single-root collection for every entry of Roots, accumulating all
+// results into the one Users set.
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+    const SmallInstructionVector &Roots,
+    const SmallInstructionSet &Exclude,
+    const SmallInstructionSet &Final,
+    DenseSet<Instruction *> &Users) {
+  for (Instruction *Root : Roots)
+    collectInLoopUserSet(Root, Exclude, Final, Users);
+}
+
+// True for a simple load, a simple store, or a non-volatile memory intrinsic;
+// false for every other instruction.
+static bool isSimpleLoadStore(Instruction *I) {
+  if (auto *Load = dyn_cast<LoadInst>(I))
+    return Load->isSimple();
+  if (auto *Store = dyn_cast<StoreInst>(I))
+    return Store->isSimple();
+  auto *MemOp = dyn_cast<MemIntrinsic>(I);
+  return MemOp && !MemOp->isVolatile();
+}
+
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This narrows the search space for DAGRoots: only the integer arithmetic,
+/// shift, cast and GEP opcodes listed below are accepted.
+static bool isSimpleArithmeticOp(User *IVU) {
+  Instruction *I = dyn_cast<Instruction>(IVU);
+  if (!I)
+    return false;
+
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+  case Instruction::AShr:
+  case Instruction::LShr:
+  case Instruction::GetElementPtr:
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+// True when U is an add instruction that has IV itself (as a PHI node) among
+// its users.
+static bool isLoopIncrement(User *U, Instruction *IV) {
+  BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+  if (!BO)
+    return false;
+  if (BO->getOpcode() != Instruction::Add)
+    return false;
+
+  for (User *IncUser : BO->users())
+    if (PHINode *PN = dyn_cast<PHINode>(IncUser))
+      if (PN == IV)
+        return true;
+  return false;
+}
+
+// Classify the users of Base. Loop increments of IV are appended to LoopIncs.
+// Add/Or instructions with a constant second operand, and GEPs with a constant
+// last operand, become roots keyed by the absolute value of that constant.
+// Every other user is treated as a plain user of Base. Returns false when a
+// user is not an instruction, when two roots share a key, when no root is
+// found, or when the roots do not all have the same number of uses.
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+  SmallInstructionVector BaseUsers;
+
+  for (User *BaseUser : Base->users()) {
+    if (isLoopIncrement(BaseUser, IV)) {
+      LoopIncs.push_back(cast<Instruction>(BaseUser));
+      continue;
+    }
+
+    // The root nodes must be either GEPs, ORs or ADDs.
+    ConstantInt *Offset = nullptr;
+    if (auto *BO = dyn_cast<BinaryOperator>(BaseUser)) {
+      const auto Opcode = BO->getOpcode();
+      if (Opcode == Instruction::Add || Opcode == Instruction::Or)
+        Offset = dyn_cast<ConstantInt>(BO->getOperand(1));
+    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(BaseUser)) {
+      Offset =
+          dyn_cast<ConstantInt>(GEP->getOperand(GEP->getNumOperands() - 1));
+    }
+
+    if (!Offset) {
+      // Not a root: remember it as a direct user of Base.
+      Instruction *UserInst = dyn_cast<Instruction>(BaseUser);
+      if (!UserInst) {
+        DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *BaseUser << "\n");
+        return false;
+      }
+      BaseUsers.push_back(UserInst);
+      continue;
+    }
+
+    int64_t Idx = std::abs(Offset->getValue().getSExtValue());
+    // No duplicates, please.
+    if (Roots.count(Idx))
+      return false;
+
+    Roots[Idx] = cast<Instruction>(BaseUser);
+  }
+
+  if (Roots.empty())
+    return false;
+
+  // If we found non-loop-inc, non-root users of Base, assume they are
+  // for the zeroth root index. This is because "add %a, 0" gets optimized
+  // away.
+  if (!BaseUsers.empty()) {
+    if (Roots.count(0)) {
+      DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+      return false;
+    }
+    Roots[0] = Base;
+  }
+
+  // Calculate the number of users of the base, or lowest indexed, iteration.
+  unsigned NumBaseUses = BaseUsers.size();
+  if (NumBaseUses == 0)
+    NumBaseUses = Roots.begin()->second->getNumUses();
+
+  // Check that every node has the same number of users.
+  for (auto &KV : Roots) {
+    if (KV.first == 0)
+      continue;
+    const unsigned NumRootUses = KV.second->getNumUses();
+    if (NumRootUses == NumBaseUses)
+      continue;
+    DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+          << "#Base=" << NumBaseUses << ", #Root=" <<
+          NumRootUses << "\n");
+    return false;
+  }
+
+  return true;
+}
+
+// Search the users of I, depth first, for a base instruction. A Mul or PHI
+// other than IV is first tried as a base itself; if that fails, I is added to
+// the (by-value) subsumed set and every user that is not a known loop
+// increment must be a simple arithmetic op for which the search succeeds.
+bool LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+  // Does the user look like it could be part of a root set?
+  // All its users must be simple arithmetic ops.
+  if (I->getNumUses() > IL_MaxRerollIterations)
+    return false;
+
+  const unsigned Opcode = I->getOpcode();
+  const bool MayBeBase =
+      (Opcode == Instruction::Mul || Opcode == Instruction::PHI) && I != IV;
+  if (MayBeBase && findRootsBase(I, SubsumedInsts))
+    return true;
+
+  SubsumedInsts.insert(I);
+
+  for (User *V : I->users()) {
+    Instruction *UserInst = dyn_cast<Instruction>(V);
+    // Loop increments are handled separately; do not descend into them.
+    if (std::find(LoopIncs.begin(), LoopIncs.end(), UserInst) !=
+        LoopIncs.end())
+      continue;
+
+    if (!UserInst)
+      return false;
+    if (!isSimpleArithmeticOp(UserInst))
+      return false;
+    if (!findRootsRecursive(UserInst, SubsumedInsts))
+      return false;
+  }
+  return true;
+}
+
+// Try to use IVU as a base instruction. On success the roots found for it are
+// split into runs and each run is appended to RootSets as one DAGRootSet.
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+
+  // The base instruction needs to be a multiply so
+  // that we can erase it.
+  const unsigned Opcode = IVU->getOpcode();
+  if (Opcode != Instruction::Mul && Opcode != Instruction::PHI)
+    return false;
+
+  std::map<int64_t, Instruction*> IndexedRoots;
+  if (!collectPossibleRoots(IVU, IndexedRoots))
+    return false;
+
+  // If we didn't get a root for index zero, then IVU must be
+  // subsumed.
+  if (!IndexedRoots.count(0))
+    SubsumedInsts.insert(IVU);
+
+  // Partition the vector into monotonically increasing indexes.
+  DAGRootSet DRS;
+  DRS.BaseInst = nullptr;
+
+  for (auto &KV : IndexedRoots) {
+    Instruction *Inst = KV.second;
+
+    // The very first entry opens the first set as its base.
+    if (!DRS.BaseInst) {
+      DRS.BaseInst = Inst;
+      DRS.SubsumedInsts = SubsumedInsts;
+      continue;
+    }
+
+    // The first root after a base is always taken; later ones only when the
+    // preceding index is present as well.
+    const bool ExtendsRun =
+        DRS.Roots.empty() || IndexedRoots.count(KV.first - 1) != 0;
+    if (ExtendsRun) {
+      DRS.Roots.push_back(Inst);
+      continue;
+    }
+
+    // Linear sequence terminated.
+    RootSets.push_back(DRS);
+    DRS.BaseInst = Inst;
+    DRS.SubsumedInsts = SubsumedInsts;
+    DRS.Roots.clear();
+  }
+  RootSets.push_back(DRS);
+
+  return true;
+}
+
+// Stage 1: locate the root sets for IV. Reads the step of IV from IVToIncMap
+// into Inc, fills LoopIncs and RootSets, and sets Scale to the number of
+// iterations found. Returns false when no usable root sets exist, when the
+// sets differ in size, when the iterations are not consecutive, or when Scale
+// exceeds IL_MaxRerollIterations.
+bool LoopReroll::DAGRootTracker::findRoots() {
+ Inc = IVToIncMap[IV];
+
+ assert(RootSets.empty() && "Unclean state!");
+ if (std::abs(Inc) == 1) { // Unit step: search the users of IV for a base.
+ for (auto *IVU : IV->users()) {
+ if (isLoopIncrement(IVU, IV))
+ LoopIncs.push_back(cast<Instruction>(IVU));
+ }
+ if (!findRootsRecursive(IV, SmallInstructionSet()))
+ return false;
+ LoopIncs.push_back(IV);
+ } else { // Any other step: IV itself has to serve as the base.
+ if (!findRootsBase(IV, SmallInstructionSet()))
+ return false;
+ }
+
+ // Ensure all sets have the same size.
+ if (RootSets.empty()) {
+ DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ return false;
+ }
+ for (auto &V : RootSets) {
+ if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+ DEBUG(dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
+ return false;
+ }
+ }
+
+ // And ensure all loop iterations are consecutive. We rely on std::map
+ // providing ordered traversal.
+ for (auto &V : RootSets) {
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
+ if (!ADR)
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+ // Now, For the loop iterations to be consecutive:
+ // D = d * N
+
+ unsigned N = V.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR); // d
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
+ DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
+ return false;
+ }
+ }
+ Scale = RootSets[0].Roots.size() + 1; // Roots plus the base iteration.
+
+ if (Scale > IL_MaxRerollIterations) {
+ DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
+ << "\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+
+ return true;
+}
+
+// Fill in Uses: for every instruction, record which iteration's use set it
+// belongs to (bit 0 for the base, bit K for Roots[K-1], IL_All for subsumed
+// instructions and users of the loop increments). Returns false if a root's
+// use set differs in size from the use set of its base.
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+  // Populate the MapVector with all instructions in the block, in order first,
+  // so we can iterate over the contents later in perfect order.
+  for (auto &I : *L->getHeader())
+    Uses[&I].resize(IL_End);
+
+  // Adds the roots, subsumed instructions and base of every root set to Set.
+  auto AddRootSetInsts = [this](SmallInstructionSet &Set) {
+    for (auto &DRS : RootSets) {
+      Set.insert(DRS.Roots.begin(), DRS.Roots.end());
+      Set.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+      Set.insert(DRS.BaseInst);
+    }
+  };
+
+  SmallInstructionSet Exclude;
+  AddRootSetInsts(Exclude);
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+  for (auto &DRS : RootSets) {
+    DenseSet<Instruction*> BaseUses;
+    collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, BaseUses);
+    for (auto *I : BaseUses)
+      Uses[I].set(0);
+
+    for (unsigned RootNo = 0, NumRoots = DRS.Roots.size(); RootNo != NumRoots;
+         ++RootNo) {
+      DenseSet<Instruction*> RootUses;
+      collectInLoopUserSet(DRS.Roots[RootNo], Exclude, PossibleRedSet,
+                           RootUses);
+
+      // While we're here, check the use sets are the same size.
+      if (RootUses.size() != BaseUses.size()) {
+        DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+        return false;
+      }
+
+      // Iteration numbers start at 1 for the first root.
+      for (auto *I : RootUses)
+        Uses[I].set(RootNo + 1);
+    }
+
+    // Make sure our subsumed instructions are remembered too.
+    for (auto *I : DRS.SubsumedInsts)
+      Uses[I].set(IL_All);
+  }
+
+  // Make sure the loop increments are also accounted for. This time the
+  // increments themselves are not part of the exclusion set.
+  Exclude.clear();
+  AddRootSetInsts(Exclude);
+
+  DenseSet<Instruction*> IncUses;
+  collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, IncUses);
+  for (auto *I : IncUses)
+    Uses[I].set(IL_All);
+
+  return true;
+}
+
+/// Find the first entry of "In", at or after the starting position, whose bit
+/// Val is set and whose instruction is not in Exclude.
+/// The search starts at *StartI when StartI is given, otherwise at In.begin().
+/// Returns In.end() when there is no such entry.
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+                                      const SmallInstructionSet &Exclude,
+                                      UsesTy::iterator *StartI) {
+  UsesTy::iterator It = StartI ? *StartI : In.begin();
+  for (; It != In.end(); ++It) {
+    if (It->second.test(Val) && !Exclude.count(It->first))
+      break;
+  }
+  return It;
+}
+
+// True when I is the BaseInst of one of the recorded root sets.
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+  for (unsigned Idx = 0, NumSets = RootSets.size(); Idx != NumSets; ++Idx)
+    if (RootSets[Idx].BaseInst == I)
+      return true;
+  return false;
+}
+
+// True when I appears in the Roots list of one of the recorded root sets.
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+  for (auto &DRS : RootSets)
+    for (Instruction *Root : DRS.Roots)
+      if (Root == I)
+        return true;
+  return false;
+}
+
+/// Return true if any entry in [Start, End) holds an instruction that is a
+/// direct user of I.
+/// NOTE(review): the name reads as "I depends on the range", but the check is
+/// made on the users of I, not on its operands - confirm this is intended.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+                                                UsesTy::iterator Start,
+                                                UsesTy::iterator End) {
+  for (UsesTy::iterator It = Start; It != End; ++It)
+    for (User *U : I->users())
+      if (U == It->first)
+        return true;
+  return false;
+}
+
+// True for debug-info intrinsics and for the annotation, ptr_annotation and
+// var_annotation intrinsics; false for everything else.
+static bool isIgnorableInst(const Instruction *I) {
+  if (isa<DbgInfoIntrinsic>(I))
+    return true;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::annotation:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+      // TODO: the following intrinsics may also be whitelisted:
+      //   lifetime_start, lifetime_end, invariant_start, invariant_end
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// Stage 2: check that the use set of every non-base iteration matches the use
+// set of the base iteration, instruction for instruction. Returns false as
+// soon as a mismatch, an overlap between iterations, a dependence on a future
+// store, a side-effect ordering problem, or an unexpected use outside the loop
+// is found. Each matched pair is recorded in Reductions via recordPair().
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
+ // We now need to check for equivalence of the use graph of each root with
+ // that of the primary induction variable (excluding the roots). Our goal
+ // here is not to solve the full graph isomorphism problem, but rather to
+ // catch common cases without a lot of work. As a result, we will assume
+ // that the relative order of the instructions in each unrolled iteration
+ // is the same (although we will not make an assumption about how the
+ // different iterations are intermixed). Note that while the order must be
+ // the same, the instructions may not be in the same basic block.
+
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet;
+ SmallInstructionSet PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet,
+ PossibleRedPHISet, PossibleRedLastSet);
+
+ // Populate "Uses" with where each instruction is used.
+ if (!collectUsedInstructions(PossibleRedSet))
+ return false;
+
+ // Make sure we mark the reduction PHIs as used in all iterations.
+ for (auto *I : PossibleRedPHISet) {
+ Uses[I].set(IL_All);
+ }
+
+ // Make sure all instructions in the loop are in one and only one
+ // set.
+ for (auto &KV : Uses) {
+ if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
+ DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ return false;
+ }
+ }
+
+ DEBUG(
+ for (auto &KV : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ }
+ );
+
+ for (unsigned Iter = 1; Iter < Scale; ++Iter) {
+ // In addition to regular aliasing information, we need to look for
+ // instructions from later (future) iterations that have side effects
+ // preventing us from reordering them past other instructions with side
+ // effects.
+ bool FutureSideEffects = false;
+ AliasSetTracker AST(*AA);
+ // The map between instructions in f(%iv.(i+1)) and f(%iv).
+ DenseMap<Value *, Value *> BaseMap;
+
+ // Compare iteration Iter to the base.
+ SmallInstructionSet Visited;
+ auto BaseIt = nextInstr(0, Uses, Visited);
+ auto RootIt = nextInstr(Iter, Uses, Visited);
+ auto LastRootIt = Uses.begin();
+
+ while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+ Instruction *BaseInst = BaseIt->first;
+ Instruction *RootInst = RootIt->first;
+
+ // Skip over the IV or root instructions; only match their users.
+ bool Continue = false;
+ if (isBaseInst(BaseInst)) {
+ Visited.insert(BaseInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ Continue = true;
+ }
+ if (isRootInst(RootInst)) {
+ LastRootIt = RootIt;
+ Visited.insert(RootInst);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ Continue = true;
+ }
+ if (Continue) continue;
+
+ if (!BaseInst->isSameOperationAs(RootInst)) {
+ // Last chance saloon. We don't try and solve the full isomorphism
+ // problem, but try and at least catch the case where two instructions
+ // *of different types* are round the wrong way. We won't be able to
+ // efficiently tell, given two ADD instructions, which way around we
+ // should match them, but given an ADD and a SUB, we can at least infer
+ // which one is which.
+ //
+ // This should allow us to deal with a greater subset of the isomorphism
+ // problem. It does however change a linear algorithm into a quadratic
+ // one, so limit the number of probes we do.
+ auto TryIt = RootIt;
+ unsigned N = NumToleratedFailedMatches;
+ while (TryIt != Uses.end() &&
+ !BaseInst->isSameOperationAs(TryIt->first) &&
+ N--) {
+ ++TryIt;
+ TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
+ }
+
+ if (TryIt == Uses.end() || TryIt == RootIt ||
+ instrDependsOn(TryIt->first, RootIt, TryIt)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << "\n");
+ return false;
+ }
+
+ RootIt = TryIt;
+ RootInst = TryIt->first;
+ }
+
+ // All instructions between the last root and this root
+ // may belong to some other iteration. If they belong to a
+ // future iteration, then they're dangerous to alias with.
+ //
+ // Note that because we allow a limited amount of flexibility in the order
+ // that we visit nodes, LastRootIt might be *before* RootIt, in which
+ // case we've already checked this set of instructions so we shouldn't
+ // do anything.
+ for (; LastRootIt < RootIt; ++LastRootIt) {
+ Instruction *I = LastRootIt->first;
+ if (LastRootIt->second.find_first() < (int)Iter)
+ continue;
+ if (I->mayWriteToMemory())
+ AST.add(I);
+ // Note: This is specifically guarded by a check on isa<PHINode>,
+ // which while a valid (somewhat arbitrary) micro-optimization, is
+ // needed because otherwise isSafeToSpeculativelyExecute returns
+ // false on PHI nodes.
+ if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (depends on future store)\n");
+ return false;
+ }
+ }
+
+ // If we've passed an instruction from a future iteration that may have
+ // side effects, and this instruction might also, then we can't reorder
+ // them, and this matching fails. As an exception, we allow the alias
+ // set tracker to handle regular (simple) load/store dependencies.
+ if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isSimpleLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst <<
+ " (side effects prevent reordering)\n");
+ return false;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+ if (!(InReduction && BaseInst->isAssociative())) {
+ bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+ // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(RootInst, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end()) {
+ Op2 = BMI->second;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
+
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+ BaseInst->getOperand(!j) == Op2) {
+ Swapped = true;
+ } else {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
+ return false;
+ }
+ }
+
+ SomeOpMatched = true;
+ }
+ }
+
+ if ((!PossibleRedLastSet.count(BaseInst) &&
+ hasUsesOutsideLoop(BaseInst, L)) ||
+ (!PossibleRedLastSet.count(RootInst) &&
+ hasUsesOutsideLoop(RootInst, L))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (uses outside loop)\n");
+ return false;
+ }
+
+ Reductions.recordPair(BaseInst, RootInst, Iter);
+ BaseMap.insert(std::make_pair(RootInst, BaseInst));
+
+ LastRootIt = RootIt;
+ Visited.insert(BaseInst);
+ Visited.insert(RootInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ }
+ assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
+ }
+
+ DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+ *IV << "\n");
+
+ return true;
+}
+
+// Stage 3: perform the reroll. Erases the header instructions whose first
+// recorded iteration is neither 0 nor IL_All, then, for each root set, expands
+// a new induction variable with the base's start value and a step of 1 (or -1
+// for a negative IV increment), redirects the base iteration's instructions
+// to it, and rewrites the header's branch condition. Finishes by simplifying
+// the header and deleting dead PHIs.
+void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
+ // Remove instructions associated with non-base iterations.
+ for (BasicBlock::reverse_iterator J = Header->rbegin();
+ J != Header->rend();) {
+ unsigned I = Uses[&*J].find_first();
+ if (I > 0 && I < IL_All) {
+ Instruction *D = &*J;
+ DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
+ D->eraseFromParent();
+ continue; // J is deliberately not advanced after an erase.
+ }
+
+ ++J;
+ }
+ bool Negative = IVToIncMap[IV] < 0;
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+
+ // We need to create a new induction variable for each different BaseInst.
+ for (auto &DRS : RootSets) {
+ // Insert the new induction variable.
+ const SCEVAddRecExpr *RealIVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ const SCEV *Start = RealIVSCEV->getStart();
+ const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr(
+ Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L,
+ SCEV::FlagAnyWrap));
+ { // Limit the lifetime of SCEVExpander.
+ SCEVExpander Expander(*SE, DL, "reroll");
+ Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front());
+
+ for (auto &KV : Uses) {
+ if (KV.second.find_first() == 0) // Only base-iteration instructions.
+ KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
+ }
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+ // FIXME: Why do we need this check?
+ if (Uses[BI].find_first() == IL_All) {
+ const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+ // Iteration count SCEV minus 1
+ const SCEV *ICMinus1SCEV = SE->getMinusSCEV(
+ ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1));
+
+ Value *ICMinus1; // Iteration count minus 1
+ if (isa<SCEVConstant>(ICMinus1SCEV)) {
+ ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
+ } else {
+ // Non-constant bound: expand it in the preheader, creating one if needed.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+
+ ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
+ Preheader->getTerminator());
+ }
+
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
+ BI->setCondition(Cond);
+
+ // The new condition is true on exit, so the header must be successor 1.
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
+ }
+ }
+ }
+
+ SimplifyInstructionsInBlock(Header, TLI);
+ DeleteDeadPHIs(Header, TLI);
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+//
+// Returns false when a non-associative chain visits iterations out of order,
+// or when a finished iteration contributed a different number of chain
+// entries than iteration 0; returns true otherwise.
+bool LoopReroll::ReductionTracker::validateSelected() {
+  // For a non-associative reduction, the chain entries must appear in order.
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int PrevIter = 0, BaseCount = 0, Count = 0;
+    for (Instruction *J : PossibleReds[i]) {
+      // Note that all instructions in the chain must have been found because
+      // all instructions in the function must have been assigned to some
+      // iteration.
+      int Iter = PossibleRedIter[J];
+      if (Iter != PrevIter && Iter != PrevIter + 1 &&
+          !PossibleReds[i].getReducedValue()->isAssociative()) {
+        // Print the instruction itself rather than its address.
+        DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+                        *J << "\n");
+        return false;
+      }
+
+      // On entering a new iteration, the one just finished must have used
+      // as many chain entries as the base iteration did.
+      if (Iter != PrevIter) {
+        if (Count != BaseCount) {
+          DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+                " reduction use count " << Count <<
+                " is not equal to the base use count " <<
+                BaseCount << "\n");
+          return false;
+        }
+
+        Count = 0;
+      }
+
+      ++Count;
+      if (Iter == 0)
+        ++BaseCount;
+
+      PrevIter = Iter;
+    }
+  }
+
+  return true;
+}
+
+// Reroll the selected reductions: every user of a reduction's final value is
+// redirected to the last chain entry that belongs to the first iteration, so
+// that only the first-iteration part of the chain (and the PHI) stays in use.
+void LoopReroll::ReductionTracker::replaceSelected() {
+  for (int RedIdx : Reds) {
+    auto &Red = PossibleReds[RedIdx];
+
+    // Fixup reductions to refer to the last instruction associated with the
+    // first iteration (not the last): find the first entry that is not in
+    // iteration 0 and step back by one.
+    int LastBaseIdx = 0;
+    const int NumEntries = Red.size();
+    while (LastBaseIdx != NumEntries &&
+           PossibleRedIter[Red[LastBaseIdx]] == 0)
+      ++LastBaseIdx;
+    if (LastBaseIdx != NumEntries)
+      --LastBaseIdx;
+
+    // Snapshot the users first; the use list changes while replacing.
+    SmallInstructionVector Users;
+    for (User *U : Red.getReducedValue()->users())
+      Users.push_back(cast<Instruction>(U));
+
+    // Replace users with the new end-of-chain value.
+    for (Instruction *UserInst : Users)
+      UserInst->replaceUsesOfWith(Red.getReducedValue(), Red[LastBaseIdx]);
+  }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+// speculatively executed), and so do the matched instructions, then we
+// cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+                        const SCEV *IterCount,
+                        ReductionTracker &Reductions) {
+  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
+                          IVToIncMap);
+
+  // Step 1: locate the root increments of IV; give up if that fails.
+  const bool FoundRoots = DAGRoots.findRoots();
+  if (!FoundRoots)
+    return false;
+  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+                  *IV << "\n");
+
+  // Step 2: the root DAGs and then the selected reductions must both
+  // validate. The reduction check only runs if the DAG check succeeded.
+  const bool Validated =
+      DAGRoots.validate(Reductions) && Reductions.validateSelected();
+  if (!Validated)
+    return false;
+
+  // Step 3: every check has passed, so from here on we are committed to
+  // making changes.
+  Reductions.replaceSelected();
+  DAGRoots.replace(IterCount);
+
+  ++NumRerolledLoops;
+  return true;
+}
+
+// Pass entry point: try to reroll the single-block loop L. Returns true if the
+// loop was rerolled with respect to one of its candidate induction variables.
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  // Cache the analyses used by the helper routines.
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+        "] Loop %" << Header->getName() << " (" <<
+        L->getNumBlocks() << " block(s))\n");
+
+  // For now, we'll handle only single BB loops.
+  if (L->getNumBlocks() > 1)
+    return false;
+
+  // The trip count must be expressible as a loop-invariant SCEV.
+  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  // Iteration count = backedge-taken count + 1.
+  const SCEV *BackedgeCount = SE->getBackedgeTakenCount(L);
+  const SCEV *IterCount =
+      SE->getAddExpr(BackedgeCount, SE->getOne(BackedgeCount->getType()));
+  DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+  // First, we need to find the induction variable with respect to which we can
+  // reroll (there may be several possible options).
+  SmallInstructionVector PossibleIVs;
+  IVToIncMap.clear();
+  collectPossibleIVs(L, PossibleIVs);
+
+  if (PossibleIVs.empty()) {
+    DEBUG(dbgs() << "LRR: No possible IVs found\n");
+    return false;
+  }
+
+  ReductionTracker Reductions;
+  collectPossibleReductions(L, Reductions);
+
+  // For each possible IV, collect the associated possible set of 'root' nodes
+  // (i+1, i+2, etc.). Stop at the first IV for which rerolling succeeds.
+  for (Instruction *CandidateIV : PossibleIVs)
+    if (reroll(CandidateIV, L, Header, IterCount, Reductions))
+      return true;
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 0000000..5e6c2da
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,624 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+// Header-size limit picked up by the LoopRotate constructor when it is given
+// a max header size of -1; initialized to 16.
+static cl::opt<unsigned>
+DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden,
+ cl::desc("The default maximum header size for automatic loop rotation"));
+
+// Incremented once per successful rotateLoop() call.
+STATISTIC(NumRotated, "Number of loops rotated");
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instructions that were outside of the loop, we have to insert PHI
+/// nodes to merge the two values. Do this now.
+///
+/// \p ValueMap maps each instruction of \p OrigHeader to the value that stands
+/// for it in \p OrigPreheader.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA;
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
+
+ // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end(); UI != UE; ) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreheader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+ }
+}
+
+/// Rotate loop \p L. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
+/// \param MaxHeaderSize is the largest header instruction count (as measured
+/// by CodeMetrics) that is still rotated.
+/// \p DT and \p SE may be null; they are only updated when provided.
+static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE,
+ bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return false;
+
+ // Give up if L->getLoopLatch() did not return a latch block; OrigLatch is
+ // needed below when the dominator tree is updated.
+ if (!OrigLatch)
+ return false;
+
+ // Rotate if either the loop latch does *not* exit the loop, or if the loop
+ // latch was just simplified.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+ return false;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: "; L->dump());
+ return false;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize)
+ return false;
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader)
+ return false;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated.
+ if (SE)
+ SE->forgetLoop(L);
+
+ DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+ // Find new Loop header. NewHeader is a Header's one and only successor
+ // that is inside loop. Header's other successor is outside the
+ // loop. Otherwise loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap;
+
+ // For PHI nodes, the value available in OrigPreheader is just the
+ // incoming value from OrigPreheader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OrigPreheader if not.
+ TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) &&
+ !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() &&
+ !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) &&
+ !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
+ Value *V = SimplifyInstruction(C, DL);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ delete C;
+ ValueMap[Inst] = V;
+ } else {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+ ValueMap[Inst] = C;
+ }
+ }
+
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ TerminatorInst *TI = OrigHeader->getTerminator();
+ for (BasicBlock *SuccBB : TI->successors())
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop). The
+ // matching PHI entries in OrigHeader are dropped by the call just below.
+ LoopEntryBranch->eraseFromParent();
+
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero())
+ != NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Update DominatorTree to reflect the CFG change we just made. Then split
+ // edges as necessary to preserve LoopSimplify form.
+ if (DT) {
+ // Everything that was dominated by the old loop header is now dominated
+ // by the original loop preheader. Conceptually the header was merged
+ // into the preheader, even though we reuse the actual block as a new
+ // loop latch.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+ DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+ assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+
+ // OrigHeader is now immediately dominated by the original latch block.
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
+ }
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(),
+ PE = ExitPreds.end();
+ PI != PE; ++PI) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(*PI);
+ if (!PredLoop || PredLoop->contains(Exit))
+ continue;
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == *PI;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) {
+ // NewHeader's idom becomes OrigPreheader; OrigHeader's becomes OrigLatch.
+ DT->changeImmediateDominator(NewHeader, OrigPreheader);
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+ // Brute force incremental dominator tree update. Call
+ // findNearestCommonDominator on all CFG predecessors of each child of the
+ // original header.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ bool Changed;
+ do {
+ Changed = false;
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+ DomTreeNode *Node = HeaderChildren[I];
+ BasicBlock *BB = Node->getBlock();
+
+ pred_iterator PI = pred_begin(BB);
+ BasicBlock *NearestDom = *PI;
+ for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
+ NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+
+ // Remember if this changes the DomTree.
+ if (Node->getIDom()->getBlock() != NearestDom) {
+ DT->changeImmediateDominator(BB, NearestDom);
+ Changed = true;
+ }
+ }
+
+ // If the dominator changed, this may have an effect on other
+ // predecessors, continue until we reach a fixpoint.
+ } while (Changed);
+ }
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+ DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+ return true;
+}
+
+/// Decide whether the instructions in [Begin, End) are safe and cheap enough
+/// to speculate. No elaborate heuristic is attempted: the range may contain a
+/// single arithmetic/logical/shift instruction (or a GEP whose indices are all
+/// constant), plus any number of trunc/zext/sext conversions and debug
+/// intrinsics. Anything else, or anything that is not safe to speculatively
+/// execute, makes the answer false.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+                                  BasicBlock::iterator End, Loop *L) {
+  // The loop counts as multi-exit when it has no unique exiting block.
+  const bool MultiExitLoop = !L->getExitingBlock();
+  bool SawIncrement = false;
+
+  for (BasicBlock::iterator It = Begin; It != End; ++It) {
+
+    if (!isSafeToSpeculativelyExecute(&*It))
+      return false;
+
+    if (isa<DbgInfoIntrinsic>(It))
+      continue;
+
+    switch (It->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // ignore type conversions
+      break;
+    case Instruction::GetElementPtr:
+      // GEPs are cheap if all indices are constant.
+      if (!cast<GEPOperator>(It)->hasAllConstantIndices())
+        return false;
+      // fall-thru to increment case
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
+      // Pick the first of the two leading operands that is not a constant.
+      Value *IVOpnd = nullptr;
+      if (!isa<Constant>(It->getOperand(0)))
+        IVOpnd = It->getOperand(0);
+      else if (!isa<Constant>(It->getOperand(1)))
+        IVOpnd = It->getOperand(1);
+      if (!IVOpnd)
+        return false;
+
+      // If increment operand is used outside of the loop, this speculation
+      // could cause extra live range interference.
+      if (MultiExitLoop)
+        for (User *OpndUser : IVOpnd->users())
+          if (!L->contains(cast<Instruction>(OpndUser)))
+            return false;
+
+      // Only one such instruction is tolerated.
+      if (SawIncrement)
+        return false;
+      SawIncrement = true;
+      break;
+    }
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// Returns true if the latch block was folded into its predecessor and erased.
+/// \p DT may be null, in which case no dominator tree is updated.
+///
+/// I don't believe this invalidates SCEV.
+static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) {
+ // Only a latch that exists and whose address is not taken is considered.
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ // The latch must end in an unconditional branch.
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ // The latch's single predecessor must be a loop-exiting block...
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ // ...that is terminated by a BranchInst.
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ // Everything in the latch ahead of its branch must be worth speculating.
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ // Hoist the instructions from Latch into LastExit.
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+ Latch->begin(), Jmp->getIterator());
+
+ // Index of the BI successor that leads to Latch (1 if successor 0 does not).
+ unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+ BasicBlock *Header = Jmp->getSuccessor(0);
+ assert(Header == L->getHeader() && "expected a backward branch");
+
+ // Remove Latch from the CFG so that LastExit becomes the new Latch.
+ BI->setSuccessor(FallThruPath, Header);
+ Latch->replaceSuccessorsPhiUsesWith(LastExit);
+ Jmp->eraseFromParent();
+
+ // Nuke the Latch block.
+ assert(Latch->empty() && "unable to evacuate Latch");
+ LI->removeBlock(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
+ Latch->eraseFromParent();
+ return true;
+}
+
+/// Simplify the latch of \c L and then rotate \c L as many times as possible.
+///
+/// Return true if the loop was modified in any way, i.e. if the latch was
+/// folded into its predecessor or the loop was rotated at least once. The
+/// result is forwarded by LoopRotate::runOnLoop as the pass's "changed" flag,
+/// so a latch simplification that is not followed by a rotation must still be
+/// reported.
+///
+/// \p DT and \p SE may be null.
+static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
+                                  const TargetTransformInfo *TTI,
+                                  AssumptionCache *AC, DominatorTree *DT,
+                                  ScalarEvolution *SE) {
+  // Save the loop metadata.
+  MDNode *LoopMD = L->getLoopID();
+
+  // Simplify the loop latch before attempting to rotate the header
+  // upward. Rotation may not be needed if the loop tail can be folded into the
+  // loop exit.
+  bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT);
+
+  // One loop can be rotated multiple times. Only the first attempt is told
+  // about the latch simplification; the flag is cleared once a rotation
+  // succeeds.
+  bool MadeChange = false;
+  while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) {
+    MadeChange = true;
+    SimplifiedLatch = false;
+  }
+
+  // Restore the loop metadata.
+  // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+  if ((MadeChange || SimplifiedLatch) && LoopMD)
+    L->setLoopID(LoopMD);
+
+  // SimplifiedLatch is still set here only when the latch was folded but no
+  // rotation followed; the IR changed in that case as well.
+  return MadeChange || SimplifiedLatch;
+}
+
+namespace {
+
+/// Loop pass wrapper around iterativelyRotateLoop().
+class LoopRotate : public LoopPass {
+  /// Largest loop header size that is still rotated.
+  unsigned MaxHeaderSize;
+
+public:
+  static char ID; // Pass ID, replacement for typeid
+
+  /// A \p SpecifiedMaxHeaderSize of -1 selects DefaultRotationThreshold; any
+  /// other value is converted to unsigned and used as given.
+  LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+    initializeLoopRotatePass(*PassRegistry::getPassRegistry());
+    const bool UseDefault = SpecifiedMaxHeaderSize == -1;
+    if (!UseDefault)
+      MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
+    else
+      MaxHeaderSize = DefaultRotationThreshold;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Analyses and canonical forms this pass needs as input.
+    // LCSSA form makes instruction renaming easier.
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+
+    // Analyses and canonical forms this pass declares preserved.
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addPreservedID(LoopSimplifyID);
+    AU.addPreservedID(LCSSAID);
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<SCEVAAWrapperPass>();
+    AU.addPreserved<BasicAAWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+    if (skipOptnoneFunction(L))
+      return false;
+    Function &Fn = *L->getHeader()->getParent();
+
+    // Required analyses.
+    LoopInfo *Loops = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+    AssumptionCache *AC =
+        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(Fn);
+
+    // Optional analyses: used (and kept up to date) only if already available.
+    DominatorTree *DT = nullptr;
+    if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+      DT = &DTWP->getDomTree();
+    ScalarEvolution *SE = nullptr;
+    if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+      SE = &SEWP->getSE();
+
+    return iterativelyRotateLoop(L, MaxHeaderSize, Loops, TTI, AC, DT, SE);
+  }
+};
+}
+
+// Storage for the pass ID, followed by the initialization macros for the
+// "loop-rotate" pass ("Rotate Loops") and the passes it names as dependencies.
+char LoopRotate::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+
+/// Factory for the loop rotation pass. \p MaxHeaderSize is handed unchanged to
+/// the LoopRotate constructor.
+Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
+  Pass *RotatePass = new LoopRotate(MaxHeaderSize);
+  return RotatePass;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 0000000..2101225
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,5024 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable; it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+// %i = phi [ 0, %entry ], [ %i.next, %latch ]
+// ...
+// %i.next = add %i, 1
+// %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+// of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+// smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+// multiple base registers, or as the increment expression in an addrec),
+// we may not actually need both reg and (-1 * reg) in registers; the
+// negation can be implemented by using a sub instead of an add. The
+// lack of support for taking this into consideration when making
+// register pressure decisions is partly worked around by the "Special"
+// use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reduce"
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Flag to clean up congruent phis after LSR phi expansion.
+// It was introduced as a temporary flag, to be removed after the v3.0
+// release once its usefulness had been determined. It is now enabled by
+// default (cl::init(true)) because the cleanup is needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+ "enable-lsr-phielim", cl::Hidden, cl::init(true),
+ cl::desc("Enable LSR phi elimination"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation. The command-line option exists only when
+// NDEBUG is not defined; it is hidden and defaults to false.
+static cl::opt<bool> StressIVChain(
+ "stress-ivchain", cl::Hidden, cl::init(false),
+ cl::desc("Stress test LSR IV chains"));
+#else
+// With NDEBUG defined there is no option, only a variable initialized to
+// false, so code that reads StressIVChain compiles in both configurations.
+static bool StressIVChain = false;
+#endif
+
+namespace {
+
+/// Pairs the type of an accessed memory location with the address space it
+/// lives in.
+struct MemAccessTy {
+  /// Sentinel address space used when the real one is not known.
+  static const unsigned UnknownAddressSpace = ~0u;
+
+  Type *MemTy;
+  unsigned AddrSpace;
+
+  /// Default state: no type and an unknown address space.
+  MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
+
+  MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
+
+  /// Two access types are equal when both the type and the address space
+  /// match.
+  bool operator==(MemAccessTy Other) const {
+    if (MemTy != Other.MemTy)
+      return false;
+    return AddrSpace == Other.AddrSpace;
+  }
+
+  bool operator!=(MemAccessTy Other) const {
+    const bool Same = (*this == Other);
+    return !Same;
+  }
+
+  /// Build the "unknown" access type: void type in the unknown address space.
+  static MemAccessTy getUnknown(LLVMContext &Ctx) {
+    Type *VoidTy = Type::getVoidTy(Ctx);
+    return MemAccessTy(VoidTy, UnknownAddressSpace);
+  }
+};
+
+/// This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+ /// This represents the set of LSRUse indices which reference
+ /// a particular register. Bit N is set when the use with index N
+ /// references the register.
+ SmallBitVector UsedByIndices;
+
+ /// Write "[NumUses=<count>]" to \p OS, where <count> is the number of set
+ /// bits in UsedByIndices.
+ void print(raw_ostream &OS) const;
+ /// Print to errs() followed by a newline.
+ void dump() const;
+};
+
+}
+
+/// Emit a short summary of the form "[NumUses=N]", where N is the number of
+/// use indices recorded for this register.
+void RegSortData::print(raw_ostream &OS) const {
+  const auto NumUses = UsedByIndices.count();
+  OS << "[NumUses=" << NumUses << ']';
+}
+
+/// Debugging aid: print this object to errs(), terminated by a newline.
+LLVM_DUMP_METHOD
+void RegSortData::dump() const {
+  raw_ostream &Err = errs();
+  print(Err);
+  Err << '\n';
+}
+
+namespace {
+
+/// Tracks, for every candidate register, which LSRUse indices reference it.
+/// Registers are also kept in a side vector so they can be iterated in the
+/// order in which they were first counted.
+class RegUseTracker {
+  using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
+
+  /// Per-register usage data.
+  RegUsesTy RegUsesMap;
+  /// Registers in first-seen order; this is what begin()/end() iterate.
+  SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+  void countRegister(const SCEV *Reg, size_t LUIdx);
+  void dropRegister(const SCEV *Reg, size_t LUIdx);
+  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+  void clear();
+
+  using iterator = SmallVectorImpl<const SCEV *>::iterator;
+  using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
+
+  iterator begin() { return RegSequence.begin(); }
+  iterator end() { return RegSequence.end(); }
+  const_iterator begin() const { return RegSequence.begin(); }
+  const_iterator end() const { return RegSequence.end(); }
+};
+
+}
+
+/// Record that the use with index \p LUIdx references \p Reg. A register seen
+/// for the first time is also appended to the iteration sequence.
+void
+RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
+  auto Inserted = RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+  if (Inserted.second)
+    RegSequence.push_back(Reg);
+
+  // Grow the bit vector only when LUIdx does not fit yet, then mark the use.
+  SmallBitVector &Indices = Inserted.first->second.UsedByIndices;
+  if (Indices.size() < LUIdx + 1)
+    Indices.resize(LUIdx + 1);
+  Indices.set(LUIdx);
+}
+
+/// Forget that the use with index \p LUIdx references \p Reg. The register
+/// must already be tracked and its bit vector must cover \p LUIdx.
+void
+RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
+  auto It = RegUsesMap.find(Reg);
+  assert(It != RegUsesMap.end());
+
+  RegSortData &RSD = It->second;
+  assert(RSD.UsedByIndices.size() > LUIdx);
+
+  SmallBitVector &Indices = RSD.UsedByIndices;
+  Indices.reset(LUIdx);
+}
+
+/// Mirror, in every register's bit vector, the removal of use \p LUIdx by
+/// moving the last use (\p LastLUIdx) into its slot and shrinking by one.
+void
+RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+  assert(LUIdx <= LastLUIdx);
+
+  // The data structure is not optimized for this operation, so every bit
+  // vector in the map has to be visited and patched individually.
+  for (auto &Entry : RegUsesMap) {
+    SmallBitVector &Bits = Entry.second.UsedByIndices;
+    const size_t NumBits = Bits.size();
+    const bool HasLastSlot = LastLUIdx < NumBits;
+
+    if (LUIdx < NumBits) {
+      // A vector too short to contain the last slot implicitly holds 0 there.
+      bool LastBit = false;
+      if (HasLastSlot)
+        LastBit = Bits[LastLUIdx];
+      Bits[LUIdx] = LastBit;
+    }
+
+    // Drop the last slot (and anything beyond it) if it exists.
+    if (HasLastSlot)
+      Bits.resize(LastLUIdx);
+  }
+}
+
+/// Return true if \p Reg is referenced by at least one use whose index is not
+/// \p LUIdx. Unknown registers and registers with no users yield false.
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+  auto Found = RegUsesMap.find(Reg);
+  if (Found == RegUsesMap.end())
+    return false;
+
+  const SmallBitVector &Users = Found->second.UsedByIndices;
+  const int FirstUser = Users.find_first();
+  if (FirstUser == -1)
+    return false;
+
+  // Either the first user is already a different use, or there must be a
+  // second user after it.
+  const bool FirstIsOther = static_cast<size_t>(FirstUser) != LUIdx;
+  return FirstIsOther || Users.find_next(FirstUser) != -1;
+}
+
+/// Return the set of use indices that reference \p Reg. The register must be
+/// known to the tracker.
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+  auto I = RegUsesMap.find(Reg);
+  assert(I != RegUsesMap.end() && "Unknown register!");
+  const RegSortData &Data = I->second;
+  return Data.UsedByIndices;
+}
+
+/// Drop every tracked register, emptying both the map and the sequence.
+void RegUseTracker::clear() {
+  RegSequence.clear();
+  RegUsesMap.clear();
+}
+
+namespace {
+
+/// This class holds information that describes a formula for computing a
+/// value that satisfies a use. It may include broken-out immediates and
+/// scaled registers.
+struct Formula {
+ /// Global base address used for complex addressing.
+ GlobalValue *BaseGV;
+
+ /// Base offset for complex addressing.
+ int64_t BaseOffset;
+
+ /// Whether any complex addressing has a base register.
+ bool HasBaseReg;
+
+ /// The scale of any complex addressing.
+ int64_t Scale;
+
+ /// The list of "base" registers for this use. The canonical representation
+ /// of a formula is
+ /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
+ /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+ /// #1 enforces that the scaled register is always used when at least two
+ /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
+ /// #2 enforces that 1 * reg is reg.
+ /// This invariant can be temporarily broken while building a formula.
+ /// However, every formula inserted into the LSRInstance must be in canonical
+ /// form.
+ SmallVector<const SCEV *, 4> BaseRegs;
+
+ /// The 'scaled' register for this use. This should be non-null when Scale is
+ /// not zero.
+ const SCEV *ScaledReg;
+
+ /// An additional constant offset which is added near the use. This requires
+ /// a temporary register, but the offset itself can live in an add immediate
+ /// field rather than a register.
+ int64_t UnfoldedOffset;
+
+ /// Start with an empty formula: no global, no registers, zero offsets and
+ /// a zero scale.
+ Formula()
+ : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
+ ScaledReg(nullptr), UnfoldedOffset(0) {}
+
+ void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+ bool isCanonical() const;
+
+ void canonicalize();
+
+ bool unscale();
+
+ size_t getNumRegs() const;
+ Type *getType() const;
+
+ void deleteBaseReg(const SCEV *&S);
+
+ bool referencesReg(const SCEV *S) const;
+ bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+}
+
+/// Recursion helper for initialMatch. Splits \p S into pieces: pieces that
+/// properly dominate the header of \p L are appended to \p Good, everything
+/// else ends up in \p Bad. Adds, addrecs with a non-zero start, and
+/// multiplications by -1 are looked through recursively.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+                           SmallVectorImpl<const SCEV *> &Good,
+                           SmallVectorImpl<const SCEV *> &Bad,
+                           ScalarEvolution &SE) {
+  // An expression that properly dominates the loop header is "good" as is.
+  if (SE.properlyDominates(S, L->getHeader())) {
+    Good.push_back(S);
+    return;
+  }
+
+  // A sum is classified operand by operand.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (const SCEV *Operand : Add->operands())
+      DoInitialMatch(Operand, L, Good, Bad, SE);
+    return;
+  }
+
+  // An addrec with a non-zero start is split into the start value and the
+  // same recurrence rebased to start at zero.
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+  if (AR && !AR->getStart()->isZero()) {
+    DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+    // FIXME: AR->getNoWrapFlags()
+    const SCEV *ZeroBased =
+        SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+                         AR->getStepRecurrence(SE), AR->getLoop(),
+                         SCEV::FlagAnyWrap);
+    DoInitialMatch(ZeroBased, L, Good, Bad, SE);
+    return;
+  }
+
+  // A negation that did not fold (multiplication by -1): classify the
+  // negated expression, then negate each resulting piece.
+  const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S);
+  if (Mul && Mul->getOperand(0)->isAllOnesValue()) {
+    SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+    const SCEV *NewMul = SE.getMulExpr(Ops);
+
+    SmallVector<const SCEV *, 4> MyGood;
+    SmallVector<const SCEV *, 4> MyBad;
+    DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+
+    const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+        SE.getEffectiveSCEVType(NewMul->getType())));
+    for (const SCEV *Piece : MyGood)
+      Good.push_back(SE.getMulExpr(NegOne, Piece));
+    for (const SCEV *Piece : MyBad)
+      Bad.push_back(SE.getMulExpr(NegOne, Piece));
+    return;
+  }
+
+  // Nothing interesting can be done: the whole expression goes into a
+  // register of its own.
+  Bad.push_back(S);
+}
+
+/// Seed this formula from \p S. DoInitialMatch splits \p S into "good" and
+/// "bad" parts; each non-empty group is summed and, unless the sum is zero,
+/// becomes one base register. The formula is canonicalized afterwards.
+void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+  SmallVector<const SCEV *, 4> Good;
+  SmallVector<const SCEV *, 4> Bad;
+  DoInitialMatch(S, L, Good, Bad, SE);
+
+  // Both groups are handled the same way. HasBaseReg is set for any
+  // non-empty group, even when its sum folds to zero.
+  auto AddGroupAsBaseReg = [&](SmallVectorImpl<const SCEV *> &Parts) {
+    if (Parts.empty())
+      return;
+    const SCEV *Sum = SE.getAddExpr(Parts);
+    if (!Sum->isZero())
+      BaseRegs.push_back(Sum);
+    HasBaseReg = true;
+  };
+  AddGroupAsBaseReg(Good);
+  AddGroupAsBaseReg(Bad);
+
+  canonicalize();
+}
+
+/// \brief Check whether or not this formula satisfies the canonical
+/// representation.
+/// \see Formula::BaseRegs.
+bool Formula::isCanonical() const {
+  // Without a scaled register at most one base register is allowed.
+  if (!ScaledReg)
+    return BaseRegs.size() <= 1;
+  // With a scaled register, a lone 1*reg is not canonical.
+  return Scale != 1 || !BaseRegs.empty();
+}
+
+/// \brief Helper method to morph a formula into its canonical representation.
+/// \see Formula::BaseRegs.
+/// Every formula having more than one base register must use the ScaledReg
+/// field. Otherwise, we would have to do special cases everywhere in LSR
+/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
+/// On the other hand, 1*reg should be canonicalized into reg.
+void Formula::canonicalize() {
+  if (isCanonical())
+    return;
+
+  // The 1*reg => reg direction has not been needed so far. It would be easy
+  // to implement, but it would be dead code and could hurt compile time.
+  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
+  // Move the last base register into the scaled slot with a scale of 1.
+  ScaledReg = BaseRegs.back();
+  BaseRegs.pop_back();
+  Scale = 1;
+
+  // If ScaledReg is not an addrec, swap it through the remaining base
+  // registers until an addrec lands in the scaled slot or they run out.
+  for (size_t Idx = 0, NumBaseRegs = BaseRegs.size();
+       Idx < NumBaseRegs && !isa<SCEVAddRecExpr>(ScaledReg); ++Idx)
+    std::swap(ScaledReg, BaseRegs[Idx]);
+}
+
+/// \brief Get rid of the scale in the formula.
+/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
+/// \return true if it was possible to get rid of the scale, false otherwise.
+/// \note After this operation the formula may not be in the canonical form.
+bool Formula::unscale() {
+  // Only a scale of exactly 1 can be dropped.
+  if (Scale != 1)
+    return false;
+
+  // The scaled register becomes an ordinary base register.
+  BaseRegs.push_back(ScaledReg);
+  ScaledReg = nullptr;
+  Scale = 0;
+  return true;
+}
+
+/// Return the total number of register operands used by this formula: all
+/// base registers plus the scaled register if there is one. This does not
+/// include register uses implied by non-constant addrec strides.
+size_t Formula::getNumRegs() const {
+  return BaseRegs.size() + (ScaledReg ? 1 : 0);
+}
+
+/// Return the type of this formula, if it has one, or null otherwise. The
+/// first base register is preferred, then the scaled register, then the
+/// global. This type is meaningless except for the bit size.
+Type *Formula::getType() const {
+  if (!BaseRegs.empty())
+    return BaseRegs.front()->getType();
+  if (ScaledReg)
+    return ScaledReg->getType();
+  if (BaseGV)
+    return BaseGV->getType();
+  return nullptr;
+}
+
+/// Delete the given base reg from the BaseRegs list. \p S is a reference to
+/// an element of BaseRegs; it is swapped with the last element (unless it is
+/// the last element), which is then popped. Element order is not preserved.
+void Formula::deleteBaseReg(const SCEV *&S) {
+  const SCEV *&Last = BaseRegs.back();
+  if (&S != &Last)
+    std::swap(S, Last);
+  BaseRegs.pop_back();
+}
+
+/// Test if this formula references the given register, either as the scaled
+/// register or as one of the base registers.
+bool Formula::referencesReg(const SCEV *S) const {
+  if (S == ScaledReg)
+    return true;
+  for (const SCEV *BaseReg : BaseRegs)
+    if (BaseReg == S)
+      return true;
+  return false;
+}
+
+/// Test whether this formula uses registers which are used by uses other than
+/// the use with the given index. The scaled register is checked first, then
+/// each base register in order.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+                                         const RegUseTracker &RegUses) const {
+  auto IsSharedWithOtherUse = [&](const SCEV *Reg) {
+    return RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx);
+  };
+
+  if (ScaledReg && IsSharedWithOtherUse(ScaledReg))
+    return true;
+
+  for (const SCEV *BaseReg : BaseRegs) {
+    if (IsSharedWithOtherUse(BaseReg))
+      return true;
+  }
+  return false;
+}
+
+/// Print the formula as a " + "-separated list of its non-trivial terms, in
+/// this order: global, base offset, base registers, scaled register and
+/// unfolded offset. A mismatch between HasBaseReg and BaseRegs is printed as
+/// an "**error: ...**" term.
+void Formula::print(raw_ostream &OS) const {
+  bool First = true;
+  // Emits the separator before every term except the first one printed.
+  auto Separate = [&] {
+    if (!First)
+      OS << " + ";
+    First = false;
+  };
+
+  if (BaseGV) {
+    Separate();
+    BaseGV->printAsOperand(OS, /*PrintType=*/false);
+  }
+
+  if (BaseOffset != 0) {
+    Separate();
+    OS << BaseOffset;
+  }
+
+  for (const SCEV *BaseReg : BaseRegs) {
+    Separate();
+    OS << "reg(" << *BaseReg << ')';
+  }
+
+  // HasBaseReg and BaseRegs are expected to agree; flag it when they do not.
+  if (HasBaseReg && BaseRegs.empty()) {
+    Separate();
+    OS << "**error: HasBaseReg**";
+  } else if (!HasBaseReg && !BaseRegs.empty()) {
+    Separate();
+    OS << "**error: !HasBaseReg**";
+  }
+
+  if (Scale != 0) {
+    Separate();
+    OS << Scale << "*reg(";
+    if (ScaledReg)
+      OS << *ScaledReg;
+    else
+      OS << "<unknown>";
+    OS << ')';
+  }
+
+  if (UnfoldedOffset != 0) {
+    Separate();
+    OS << "imm(" << UnfoldedOffset << ')';
+  }
+}
+
+/// Debugging aid: print this formula to errs(), terminated by a newline.
+LLVM_DUMP_METHOD
+void Formula::dump() const {
+  raw_ostream &Err = errs();
+  print(Err);
+  Err << '\n';
+}
+
+/// Return true if the given addrec can be sign-extended without changing its
+/// value. The test sign-extends to a type one bit wider and checks that
+/// ScalarEvolution still produces an addrec.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  const auto SrcBits = SE.getTypeSizeInBits(AR->getType());
+  Type *WideTy = IntegerType::get(SE.getContext(), SrcBits + 1);
+  const SCEV *Extended = SE.getSignExtendExpr(AR, WideTy);
+  return isa<SCEVAddRecExpr>(Extended);
+}
+
+/// Return true if the given add can be sign-extended without changing its
+/// value. The test sign-extends to a type one bit wider and checks that
+/// ScalarEvolution still produces an add.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+  const auto SrcBits = SE.getTypeSizeInBits(A->getType());
+  Type *WideTy = IntegerType::get(SE.getContext(), SrcBits + 1);
+  const SCEV *Extended = SE.getSignExtendExpr(A, WideTy);
+  return isa<SCEVAddExpr>(Extended);
+}
+
+/// Return true if the given mul can be sign-extended without changing its
+/// value. The wide type has the source bit width multiplied by the number of
+/// operands; the test checks that the sign-extension is still a mul.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
+  const auto WideBits =
+      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands();
+  Type *WideTy = IntegerType::get(SE.getContext(), WideBits);
+  const SCEV *Extended = SE.getSignExtendExpr(M, WideTy);
+  return isa<SCEVMulExpr>(Extended);
+}
+
+/// Return an expression for LHS /s RHS, if it can be determined and if the
+/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
+/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
+/// the multiplication may overflow, which is useful when the result will be
+/// used in a context where the most significant bits are ignored.
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE,
+                                bool IgnoreSignificantBits = false) {
+  // Anything divided by itself is 1; this works for any SCEV type.
+  if (LHS == RHS)
+    return SE.getConstant(LHS->getType(), 1);
+
+  // Constant divisors -1 and 1 need no real division.
+  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+  if (RC) {
+    const APInt &Divisor = RC->getAPInt();
+    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+    // some folding.
+    if (Divisor.isAllOnesValue())
+      return SE.getMulExpr(LHS, RC);
+    // x /s 1 is x.
+    if (Divisor == 1)
+      return LHS;
+  }
+
+  // Constant divided by constant: succeed only when the remainder is zero.
+  if (const SCEVConstant *LC = dyn_cast<SCEVConstant>(LHS)) {
+    if (!RC)
+      return nullptr;
+    const APInt &Dividend = LC->getAPInt();
+    const APInt &Divisor = RC->getAPInt();
+    if (Dividend.srem(Divisor) != 0)
+      return nullptr;
+    return SE.getConstant(Dividend.sdiv(Divisor));
+  }
+
+  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+    if (!IgnoreSignificantBits && !isAddRecSExtable(AR, SE))
+      return nullptr;
+
+    const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+                                    IgnoreSignificantBits);
+    if (!Step)
+      return nullptr;
+    const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                     IgnoreSignificantBits);
+    if (!Start)
+      return nullptr;
+
+    // FlagNW is independent of the start value, step direction, and is
+    // preserved with smaller magnitude steps.
+    // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+    return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+  }
+
+  // Distribute the sdiv over add operands, if the add doesn't overflow.
+  // Every operand has to divide exactly.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+    if (!IgnoreSignificantBits && !isAddSExtable(Add, SE))
+      return nullptr;
+
+    SmallVector<const SCEV *, 8> Quotients;
+    for (const SCEV *Term : Add->operands()) {
+      const SCEV *Quotient = getExactSDiv(Term, RHS, SE, IgnoreSignificantBits);
+      if (!Quotient)
+        return nullptr;
+      Quotients.push_back(Quotient);
+    }
+    return SE.getAddExpr(Quotients);
+  }
+
+  // For a multiply, pull RHS out of the first operand that divides exactly
+  // and leave all other operands untouched.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+    if (!IgnoreSignificantBits && !isMulSExtable(Mul, SE))
+      return nullptr;
+
+    SmallVector<const SCEV *, 4> Factors;
+    bool Found = false;
+    for (const SCEV *Factor : Mul->operands()) {
+      const SCEV *Quotient =
+          Found ? nullptr
+                : getExactSDiv(Factor, RHS, SE, IgnoreSignificantBits);
+      if (Quotient) {
+        Found = true;
+        Factors.push_back(Quotient);
+      } else {
+        Factors.push_back(Factor);
+      }
+    }
+    return Found ? SE.getMulExpr(Factors) : nullptr;
+  }
+
+  // Otherwise we don't know.
+  return nullptr;
+}
+
+/// If S involves the addition of a constant integer value, return that integer
+/// value, and mutate S to point to a new SCEV with that value excluded.
+/// Returns 0 and leaves S untouched when nothing is extracted, including for
+/// a constant that does not fit in 64 signed bits.
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+  // A plain constant is extracted whole and replaced by zero.
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+    if (C->getAPInt().getMinSignedBits() > 64)
+      return 0;
+    S = SE.getConstant(C->getType(), 0);
+    return C->getValue()->getSExtValue();
+  }
+
+  // For an add, only the first operand is examined.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+    const int64_t Imm = ExtractImmediate(NewOps.front(), SE);
+    if (Imm != 0)
+      S = SE.getAddExpr(NewOps);
+    return Imm;
+  }
+
+  // For an addrec, the immediate is taken from the first operand (the start).
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+    const int64_t Imm = ExtractImmediate(NewOps.front(), SE);
+    if (Imm != 0) {
+      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop(), SCEV::FlagAnyWrap);
+    }
+    return Imm;
+  }
+
+  return 0;
+}
+
+/// If S involves the addition of a GlobalValue address, return that symbol, and
+/// mutate S to point to a new SCEV with that value excluded. Returns null and
+/// leaves S untouched when no symbol is found.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+  // An unknown that wraps a GlobalValue is extracted and replaced by zero.
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue());
+    if (!GV)
+      return nullptr;
+    S = SE.getConstant(GV->getType(), 0);
+    return GV;
+  }
+
+  // For an add, only the last operand is examined.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+    GlobalValue *Symbol = ExtractSymbol(NewOps.back(), SE);
+    if (Symbol)
+      S = SE.getAddExpr(NewOps);
+    return Symbol;
+  }
+
+  // For an addrec, the symbol is taken from the first operand (the start).
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+    GlobalValue *Symbol = ExtractSymbol(NewOps.front(), SE);
+    if (Symbol) {
+      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop(), SCEV::FlagAnyWrap);
+    }
+    return Symbol;
+  }
+
+  return nullptr;
+}
+
+/// Returns true if the specified instruction is using the specified value as an
+/// address. Every load counts; a store counts when the value is its operand 1;
+/// the listed intrinsics count when the value is their argument 0.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+  if (isa<LoadInst>(Inst))
+    return true;
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return SI->getOperand(1) == OperandVal;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::prefetch:
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      return II->getArgOperand(0) == OperandVal;
+    default:
+      return false;
+    }
+  }
+
+  return false;
+}
+
+/// Return the type of the memory being accessed, together with the address
+/// space when the instruction is a load or a store. The default is the
+/// instruction's own type in the unknown address space.
+static MemAccessTy getAccessType(const Instruction *Inst) {
+  Type *MemTy = Inst->getType();
+  unsigned AddrSpace = MemAccessTy::UnknownAddressSpace;
+
+  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    // A store accesses the type of the value it stores (operand 0).
+    MemTy = SI->getOperand(0)->getType();
+    AddrSpace = SI->getPointerAddressSpace();
+  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    AddrSpace = LI->getPointerAddressSpace();
+  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      MemTy = II->getArgOperand(0)->getType();
+      break;
+    default:
+      break;
+    }
+  }
+
+  // All pointers have the same requirements, so canonicalize them to an
+  // arbitrary pointer type (i1* in the same address space) to minimize
+  // variation.
+  if (PointerType *PTy = dyn_cast<PointerType>(MemTy)) {
+    Type *I1Ty = IntegerType::get(PTy->getContext(), 1);
+    MemTy = PointerType::get(I1Ty, PTy->getAddressSpace());
+  }
+
+  return MemAccessTy(MemTy, AddrSpace);
+}
+
+/// Return true if this AddRec is already a phi in its loop, i.e. some phi at
+/// the top of the loop header has the same effective SCEV type and its SCEV
+/// is exactly \p AR.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  for (Instruction &I : *AR->getLoop()->getHeader()) {
+    // Only the leading run of phi nodes is examined.
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+
+    if (!SE.isSCEVable(PN->getType()))
+      continue;
+    if (SE.getEffectiveSCEVType(PN->getType()) !=
+        SE.getEffectiveSCEVType(AR->getType()))
+      continue;
+    if (SE.getSCEV(PN) == AR)
+      return true;
+  }
+  return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// \p Processed records the non-leaf, non-cast expressions already visited;
+/// an expression seen a second time is reported as not high cost.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+ SmallPtrSetImpl<const SCEV*> &Processed,
+ ScalarEvolution &SE) {
+ // Zero/One operand expressions: unknowns and constants are free, casts are
+ // as expensive as their operand. Other kinds fall through the switch.
+ switch (S->getSCEVType()) {
+ case scUnknown:
+ case scConstant:
+ return false;
+ case scTruncate:
+ return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+ Processed, SE);
+ case scZeroExtend:
+ return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ case scSignExtend:
+ return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ }
+
+ // Each remaining expression is analysed only once.
+ if (!Processed.insert(S).second)
+ return false;
+
+ // An add is high cost only if one of its operands is.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands()) {
+ if (isHighCostExpansion(S, Processed, SE))
+ return true;
+ }
+ return false;
+ }
+
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ if (Mul->getNumOperands() == 2) {
+ // Multiplication by a constant is ok
+ if (isa<SCEVConstant>(Mul->getOperand(0)))
+ return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+ // If we have the value of one operand, check if an existing
+ // multiplication already generates this expression.
+ // NOTE(review): the loop returns at the first Mul user it meets, and it
+ // returns true (high cost) when that user's SCEV equals Mul. That reads
+ // as the opposite of what the comment above suggests - confirm the
+ // intended polarity before relying on it.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+ Value *UVal = U->getValue();
+ for (User *UR : UVal->users()) {
+ // If U is a constant, it may be used by a ConstantExpr.
+ Instruction *UI = dyn_cast<Instruction>(UR);
+ if (UI && UI->getOpcode() == Instruction::Mul &&
+ SE.isSCEVable(UI->getType())) {
+ return SE.getSCEV(UI) == Mul;
+ }
+ }
+ }
+ }
+ }
+
+ // An addrec that already exists as a phi costs nothing extra.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (isExistingPhi(AR, SE))
+ return false;
+ }
+
+ // For now, consider any other type of expression (div/mul/min/max) high cost.
+ return true;
+}
+
+/// If any of the instructions in the specified set are trivially dead, delete
+/// them and see if this makes any of their operands subsequently dead.
+/// \p DeadInsts is used as a worklist and is empty on return. Returns true if
+/// at least one instruction was erased.
+static bool
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
+  bool Changed = false;
+
+  while (!DeadInsts.empty()) {
+    Value *V = DeadInsts.pop_back_val();
+    Instruction *Dead = dyn_cast_or_null<Instruction>(V);
+    if (!Dead || !isInstructionTriviallyDead(Dead))
+      continue;
+
+    // Detach every instruction operand; an operand left without any use
+    // becomes a new candidate on the worklist.
+    for (Use &Op : Dead->operands()) {
+      Instruction *OpInst = dyn_cast<Instruction>(Op);
+      if (!OpInst)
+        continue;
+      Op = nullptr;
+      if (OpInst->use_empty())
+        DeadInsts.emplace_back(OpInst);
+    }
+
+    Dead->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+// Forward declaration: LSRUse is only used by reference in the prototypes
+// below and in the Cost class.
+namespace {
+class LSRUse;
+}
+
+/// \brief Check if the addressing mode defined by \p F is completely
+/// folded in \p LU at isel time.
+/// This includes address-mode folding and special icmp tricks.
+/// This function returns true if \p LU can accommodate what \p F
+/// defines and up to 1 base + 1 scaled + offset.
+/// In other words, if \p F has several base registers, this function may
+/// still return true. Therefore, users still need to account for
+/// additional base registers and/or unfolded offsets to derive an
+/// accurate cost model.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F);
+/// Get the cost of the scaling factor used in \p F for \p LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F);
+
+namespace {
+
+/// This class is used to measure and compare candidate formulae. A cost is a
+/// tuple of counters; a "losing" cost is represented by NumRegs == ~0u.
+class Cost {
+  /// TODO: Some of these could be merged. Also, a lexical ordering
+  /// isn't always optimal.
+  unsigned NumRegs;
+  unsigned AddRecCost;
+  unsigned NumIVMuls;
+  unsigned NumBaseAdds;
+  unsigned ImmCost;
+  unsigned SetupCost;
+  unsigned ScaleCost;
+
+public:
+  /// Every counter starts at zero.
+  Cost()
+      : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
+        SetupCost(0), ScaleCost(0) {}
+
+  bool operator<(const Cost &Other) const;
+
+  void Lose();
+
+#ifndef NDEBUG
+  // Once any of the metrics loses, they must all remain losers.
+  bool isValid() {
+    const unsigned AnyBits = NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+                             | ImmCost | SetupCost | ScaleCost;
+    const unsigned CommonBits = NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+                                & ImmCost & SetupCost & ScaleCost;
+    return AnyBits != ~0u || CommonBits == ~0u;
+  }
+#endif
+
+  /// True when this cost has been marked as a loser.
+  bool isLoser() {
+    assert(isValid() && "invalid cost");
+    return NumRegs == ~0u;
+  }
+
+  void RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                   SmallPtrSetImpl<const SCEV *> &Regs,
+                   const DenseSet<const SCEV *> &VisitedRegs, const Loop *L,
+                   const SmallVectorImpl<int64_t> &Offsets,
+                   ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU,
+                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+
+private:
+  void RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs,
+                    const Loop *L, ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg,
+                           SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+};
+
+}
+
+/// Tally up interesting quantities from the given register: register count,
+/// addrec cost, preheader setup cost and IV multiplications. The cost is
+/// marked as a loser when \p Reg is an addrec of another loop that is not an
+/// existing phi.
+void Cost::RateRegister(const SCEV *Reg,
+                        SmallPtrSetImpl<const SCEV *> &Regs,
+                        const Loop *L,
+                        ScalarEvolution &SE, DominatorTree &DT) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has already run on inner loops, will not run on outer
+    // loops, and cannot be expected to change sibling loops.
+    if (AR->getLoop() != L) {
+      // An existing phi is treated as a free register and left alone; any
+      // other foreign addrec disqualifies the formula.
+      if (!isExistingPhi(AR, SE))
+        Lose();
+      return;
+    }
+    AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    const SCEV *Step = AR->getOperand(1);
+    const bool StepNeedsReg = !AR->isAffine() || !isa<SCEVConstant>(Step);
+    if (StepNeedsReg && !Regs.count(Step)) {
+      RateRegister(Step, Regs, L, SE, DT);
+      if (isLoser())
+        return;
+    }
+  }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader. Unknowns, constants, and addrecs that
+  // start at an unknown or a constant need no setup.
+  auto IsLeaf = [](const SCEV *Expr) {
+    return isa<SCEVUnknown>(Expr) || isa<SCEVConstant>(Expr);
+  };
+  bool NeedsNoSetup = IsLeaf(Reg);
+  if (!NeedsNoSetup)
+    if (const SCEVAddRecExpr *RegAR = dyn_cast<SCEVAddRecExpr>(Reg))
+      NeedsNoSetup = IsLeaf(RegAR->getStart());
+  if (!NeedsNoSetup)
+    ++SetupCost;
+
+  // Count multiplications whose value evolves with the loop.
+  if (isa<SCEVMulExpr>(Reg) && SE.hasComputableLoopEvolution(Reg, L))
+    ++NumIVMuls;
+}
+
+/// Add \p Reg to \p Regs and rate it if it was not already present.
+///
+/// The optional \p LoserRegs set lets callers declare any formula that refers
+/// to one of those registers an instant loser; a register whose rating makes
+/// this cost a loser is added to that set.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+                               SmallPtrSetImpl<const SCEV *> &Regs,
+                               const Loop *L,
+                               ScalarEvolution &SE, DominatorTree &DT,
+                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  // A register already known to lose poisons the formula immediately.
+  if (LoserRegs && LoserRegs->count(Reg)) {
+    Lose();
+    return;
+  }
+
+  // Registers seen before have already been accounted for.
+  if (!Regs.insert(Reg).second)
+    return;
+
+  RateRegister(Reg, Regs, L, SE, DT);
+  if (LoserRegs && isLoser())
+    LoserRegs->insert(Reg);
+}
+
+/// Accumulate the cost of the canonical formula \p F into this Cost.
+///
+/// Rates the scaled register and then each base register. The cost becomes a
+/// loser, and rating stops, as soon as a register is found in \p VisitedRegs
+/// or rating a register loses. Otherwise the base-add, scale and immediate
+/// costs are tallied, the latter once per entry of \p Offsets.
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+                       const Formula &F,
+                       SmallPtrSetImpl<const SCEV *> &Regs,
+                       const DenseSet<const SCEV *> &VisitedRegs,
+                       const Loop *L,
+                       const SmallVectorImpl<int64_t> &Offsets,
+                       ScalarEvolution &SE, DominatorTree &DT,
+                       const LSRUse &LU,
+                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+
+  // Rate one register of the formula; returns false once this cost has lost.
+  auto RateOrLose = [&](const SCEV *Reg) -> bool {
+    if (VisitedRegs.count(Reg)) {
+      Lose();
+      return false;
+    }
+    RatePrimaryRegister(Reg, Regs, L, SE, DT, LoserRegs);
+    return !isLoser();
+  };
+
+  // Tally up the registers: the scaled register first, then the base ones.
+  if (F.ScaledReg && !RateOrLose(F.ScaledReg))
+    return;
+  for (const SCEV *BaseReg : F.BaseRegs)
+    if (!RateOrLose(BaseReg))
+      return;
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  const size_t NumBaseParts = F.getNumRegs();
+  if (NumBaseParts > 1) {
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds +=
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+  }
+  if (F.UnfoldedOffset != 0)
+    ++NumBaseAdds;
+
+  // Accumulate non-free scaling amounts.
+  ScaleCost += getScalingFactorCost(TTI, LU, F);
+
+  // Tally up the non-zero immediates.
+  for (int64_t O : Offsets) {
+    // Add in unsigned arithmetic so that wrap-around is well defined.
+    const int64_t Offset = (uint64_t)O + F.BaseOffset;
+    if (F.BaseGV) {
+      // Handle symbolic values conservatively.
+      // TODO: This should probably be the pointer size.
+      ImmCost += 64;
+    } else if (Offset != 0) {
+      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+    }
+  }
+  assert(isValid() && "invalid cost");
+}
+
+/// Mark this cost as a loser by saturating every component to ~0u.
+void Cost::Lose() {
+  const unsigned Losing = ~0u;
+  NumRegs = Losing;
+  AddRecCost = Losing;
+  NumIVMuls = Losing;
+  NumBaseAdds = Losing;
+  ImmCost = Losing;
+  SetupCost = Losing;
+  ScaleCost = Losing;
+}
+
+/// Order costs lexicographically: register count first, then addrec cost,
+/// IV multiplies, base adds, scale cost, immediate cost and setup cost.
+bool Cost::operator<(const Cost &Other) const {
+  const auto Mine = std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds,
+                             ScaleCost, ImmCost, SetupCost);
+  const auto Theirs =
+      std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
+               Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
+               Other.SetupCost);
+  return Mine < Theirs;
+}
+
+/// Print a one-line summary of this cost to \p OS. The register count is
+/// always printed; every other component is printed only when non-zero.
+void Cost::print(raw_ostream &OS) const {
+  // Plural suffix for a count.
+  auto Plural = [](unsigned N) { return N == 1 ? "" : "s"; };
+
+  OS << NumRegs << " reg" << Plural(NumRegs);
+  if (AddRecCost != 0)
+    OS << ", with addrec cost " << AddRecCost;
+  if (NumIVMuls != 0)
+    OS << ", plus " << NumIVMuls << " IV mul" << Plural(NumIVMuls);
+  if (NumBaseAdds != 0)
+    OS << ", plus " << NumBaseAdds << " base add" << Plural(NumBaseAdds);
+  if (ScaleCost != 0)
+    OS << ", plus " << ScaleCost << " scale cost";
+  if (ImmCost != 0)
+    OS << ", plus " << ImmCost << " imm cost";
+  if (SetupCost != 0)
+    OS << ", plus " << SetupCost << " setup cost";
+}
+
+LLVM_DUMP_METHOD
+void Cost::dump() const {
+ print(errs()); errs() << '\n';
+}
+
+namespace {
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+ /// The instruction which will be updated.
+ Instruction *UserInst;
+
+ /// The operand of the instruction which will be replaced. The operand may be
+ /// used more than once; every instance will be replaced.
+ Value *OperandValToReplace;
+
+ /// If this user is to use the post-incremented value of an induction
+ /// variable, this set is non-empty and holds the loop(s) associated with the
+ /// induction variable(s).
+ PostIncLoopSet PostIncLoops;
+
+ /// The index of the LSRUse describing the expression which this fixup needs,
+ /// minus an offset (below). Initialized to ~size_t(0), which print() treats
+ /// as "not assigned".
+ size_t LUIdx;
+
+ /// A constant offset to be added to the LSRUse expression. This allows
+ /// multiple fixups to share the same LSRUse with different offsets, for
+ /// example in an unrolled loop.
+ int64_t Offset;
+
+ /// Test whether this fixup always uses its value outside of the given loop.
+ bool isUseFullyOutsideLoop(const Loop *L) const;
+
+ /// Create a fixup with null pointers, an unassigned LUIdx and a zero Offset.
+ LSRFixup();
+
+ /// Print a one-line description of this fixup to OS.
+ void print(raw_ostream &OS) const;
+ /// Print this fixup, followed by a newline, to errs().
+ void dump() const;
+};
+
+}
+
+/// Construct an empty fixup: no user instruction, no operand, a zero offset,
+/// and LUIdx set to the ~size_t(0) sentinel that print() treats as unset.
+LSRFixup::LSRFixup()
+ : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
+ Offset(0) {}
+
+/// Return true if every use of the value by this fixup happens outside \p L.
+///
+/// A PHI node uses each incoming value in the corresponding incoming block,
+/// so for PHIs the incoming blocks are tested rather than the PHI itself.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+  const PHINode *Phi = dyn_cast<PHINode>(UserInst);
+  if (!Phi)
+    return !L->contains(UserInst);
+
+  // Only the incoming edges that carry the operand being replaced matter.
+  for (unsigned Idx = 0, NumIn = Phi->getNumIncomingValues(); Idx != NumIn;
+       ++Idx) {
+    if (Phi->getIncomingValue(Idx) != OperandValToReplace)
+      continue;
+    if (L->contains(Phi->getIncomingBlock(Idx)))
+      return false;
+  }
+  return true;
+}
+
+/// Print a one-line description of this fixup to \p OS: the user, the operand
+/// being replaced, any post-increment loops, and the use index and offset
+/// when they are set.
+void LSRFixup::print(raw_ostream &OS) const {
+  OS << "UserInst=";
+  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+    // Store is common and interesting enough to be worth special-casing.
+    OS << "store ";
+    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+  } else if (!UserInst->getType()->isVoidTy()) {
+    UserInst->printAsOperand(OS, /*PrintType=*/false);
+  } else {
+    // A void instruction has no value name, so print its opcode instead.
+    OS << UserInst->getOpcodeName();
+  }
+
+  OS << ", OperandValToReplace=";
+  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
+
+  for (const Loop *PostIncLoop : PostIncLoops) {
+    OS << ", PostIncLoop=";
+    PostIncLoop->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+  }
+
+  const bool HasUse = LUIdx != ~size_t(0);
+  if (HasUse)
+    OS << ", LUIdx=" << LUIdx;
+
+  if (Offset != 0)
+    OS << ", Offset=" << Offset;
+}
+
+LLVM_DUMP_METHOD
+void LSRFixup::dump() const {
+ print(errs()); errs() << '\n';
+}
+
+namespace {
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+ /// The empty key: a one-element vector holding the pointer value -1, which
+ /// is not a real SCEV pointer.
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-1));
+ return V;
+ }
+
+ /// The tombstone key: a one-element vector holding the pointer value -2,
+ /// distinct from the empty key.
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-2));
+ return V;
+ }
+
+ /// Hash the pointer values in V, in order; callers sort the vector first so
+ /// that equal register sets hash identically.
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ /// Two keys are equal when they hold the same pointers in the same order.
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+ /// The sorted register lists (base registers plus the scaled register) of
+ /// every formula that InsertFormula has accepted, used to reject formulae
+ /// that use an already-seen set of registers.
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+ /// An enum for a kind of use, indicating what types of scaled and immediate
+ /// operands it might support.
+ enum KindType {
+ Basic, ///< A normal use, with no folding.
+ Special, ///< A special case of basic, allowing -1 scales.
+ Address, ///< An address use; folding according to TargetLowering
+ ICmpZero ///< An equality icmp with both operands folded into one.
+ // TODO: Add a generic icmp too?
+ };
+
+ /// A SCEV pointer packed together with a use kind.
+ typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
+ /// The kind of this use.
+ KindType Kind;
+ /// The memory type and address space accessed; consulted for Address uses.
+ MemAccessTy AccessTy;
+
+ /// Constant offsets associated with this use; each one contributes to the
+ /// immediate cost of a formula. NOTE(review): presumably one entry per
+ /// distinct fixup offset -- confirm against the code that fills it.
+ SmallVector<int64_t, 8> Offsets;
+ /// Smallest and largest offset; the constructor initializes them to
+ /// INT64_MAX and INT64_MIN respectively.
+ int64_t MinOffset;
+ int64_t MaxOffset;
+
+ /// This records whether all of the fixups using this LSRUse are outside of
+ /// the loop, in which case some special-case heuristics may be used.
+ bool AllFixupsOutsideLoop;
+
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula;
+
+ /// This records the widest use type for any fixup using this
+ /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+ /// fixup widths to be equivalent, because the narrower one may be relying on
+ /// the implicit truncation to truncate away bogus bits.
+ Type *WidestFixupType;
+
+ /// A list of ways to build a value that can satisfy this user. After the
+ /// list is populated, one of these is selected heuristically and used to
+ /// formulate a replacement for OperandValToReplace in UserInst.
+ SmallVector<Formula, 12> Formulae;
+
+ /// The set of register candidates used by all formulae in this LSRUse.
+ SmallPtrSet<const SCEV *, 4> Regs;
+
+ /// Create a use of kind K accessing AT, with no formulae, an inverted
+ /// (empty) offset range, and all fixups assumed to be outside the loop.
+ LSRUse(KindType K, MemAccessTy AT)
+ : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+ AllFixupsOutsideLoop(true), RigidFormula(false),
+ WidestFixupType(nullptr) {}
+
+ /// Test whether an inserted formula uses the same registers as F.
+ bool HasFormulaWithSameRegs(const Formula &F) const;
+ /// Add F unless it is a duplicate or this use is rigid and already has a
+ /// formula; returns true if F was added.
+ bool InsertFormula(const Formula &F);
+ /// Remove F, which must be an element of Formulae, from the list.
+ void DeleteFormula(Formula &F);
+ /// Rebuild Regs from Formulae and drop registers no longer used from the
+ /// given tracker.
+ void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+ /// Print a one-line description of this use to OS.
+ void print(raw_ostream &OS) const;
+ /// Print this use, followed by a newline, to errs().
+ void dump() const;
+};
+
+}
+
+/// Test whether this use has a formula which has the same registers as the
+/// given formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+  // Build the lookup key: all base registers plus the scaled register.
+  SmallVector<const SCEV *, 4> SortedRegs = F.BaseRegs;
+  if (F.ScaledReg)
+    SortedRegs.push_back(F.ScaledReg);
+
+  // Sorting by pointer value is host-dependent, but that is fine because the
+  // key is only used for uniquifying.
+  std::sort(SortedRegs.begin(), SortedRegs.end());
+  return Uniquifier.count(SortedRegs);
+}
+
+/// Add the canonical formula \p F to this use and return true, unless it is
+/// rejected. A formula is rejected (returning false) when this use is rigid
+/// and already has a formula, or when a formula with the same set of
+/// registers was inserted before.
+bool LSRUse::InsertFormula(const Formula &F) {
+  assert(F.isCanonical() && "Invalid canonical representation");
+
+  // A rigid use keeps only the formula that initially matched.
+  if (RigidFormula && !Formulae.empty())
+    return false;
+
+  // Build the uniquifying key: all base registers plus the scaled register.
+  // Sorting by pointer value is host-dependent, but that is fine because the
+  // key is only used for uniquifying.
+  SmallVector<const SCEV *, 4> SortedRegs = F.BaseRegs;
+  if (F.ScaledReg)
+    SortedRegs.push_back(F.ScaledReg);
+  std::sort(SortedRegs.begin(), SortedRegs.end());
+
+  const bool IsNew = Uniquifier.insert(SortedRegs).second;
+  if (!IsNew)
+    return false;
+
+  // Using a register to hold the value of 0 is not profitable.
+  assert(!(F.ScaledReg && F.ScaledReg->isZero()) &&
+         "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+  for (const SCEV *BaseReg : F.BaseRegs)
+    assert(!BaseReg->isZero() && "Zero allocated in a base register!");
+#endif
+
+  Formulae.push_back(F);
+
+  // Record the registers this use now depends on.
+  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+  if (F.ScaledReg)
+    Regs.insert(F.ScaledReg);
+
+  return true;
+}
+
+/// Remove the given formula from this use's list.
+///
+/// F must refer to an element of Formulae. It is swapped with the last
+/// element and the list is shortened by one, so the relative order of the
+/// remaining formulae is not preserved, and after the call F refers either to
+/// the formula that used to be last or to a popped slot. Neither Regs nor the
+/// Uniquifier is updated here.
+void LSRUse::DeleteFormula(Formula &F) {
+ // Avoid a self-swap when F is already the last element.
+ if (&F != &Formulae.back())
+ std::swap(F, Formulae.back());
+ Formulae.pop_back();
+}
+
+/// Rebuild the Regs set from the formulae that remain, and tell \p RegUses
+/// that use \p LUIdx no longer needs any register that dropped out.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+  // Keep the previous set around so the two can be compared afterwards.
+  SmallPtrSet<const SCEV *, 4> PreviousRegs = std::move(Regs);
+  Regs.clear();
+
+  // Now that some formulae have been filtered out, collect what is left.
+  for (const Formula &Remaining : Formulae) {
+    if (Remaining.ScaledReg)
+      Regs.insert(Remaining.ScaledReg);
+    Regs.insert(Remaining.BaseRegs.begin(), Remaining.BaseRegs.end());
+  }
+
+  // Update the register tracker for every register that went away.
+  for (const SCEV *Reg : PreviousRegs) {
+    if (Regs.count(Reg))
+      continue;
+    RegUses.dropRegister(Reg, LUIdx);
+  }
+}
+
+/// Print a one-line description of this use to \p OS: its kind (with the
+/// accessed type and address space for address uses), its offsets, and the
+/// all-fixups-outside-loop and widest-fixup-type properties when set.
+void LSRUse::print(raw_ostream &OS) const {
+  OS << "LSR Use: Kind=";
+  switch (Kind) {
+  case Basic:
+    OS << "Basic";
+    break;
+  case Special:
+    OS << "Special";
+    break;
+  case ICmpZero:
+    OS << "ICmpZero";
+    break;
+  case Address:
+    OS << "Address of ";
+    // The full pointer type could be really verbose, so abbreviate it.
+    if (!AccessTy.MemTy->isPointerTy())
+      OS << *AccessTy.MemTy;
+    else
+      OS << "pointer";
+    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
+    break;
+  }
+
+  OS << ", Offsets={";
+  bool IsFirst = true;
+  for (int64_t O : Offsets) {
+    if (!IsFirst)
+      OS << ',';
+    IsFirst = false;
+    OS << O;
+  }
+  OS << '}';
+
+  if (AllFixupsOutsideLoop)
+    OS << ", all-fixups-outside-loop";
+
+  if (WidestFixupType)
+    OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+LLVM_DUMP_METHOD
+void LSRUse::dump() const {
+ print(errs()); errs() << '\n';
+}
+
+/// Test whether the addressing-mode-like expression
+/// BaseGV + BaseOffset + BaseReg + Scale*ScaledReg can be folded completely
+/// into a use of the given \p Kind.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
+                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 bool HasBaseReg, int64_t Scale) {
+  switch (Kind) {
+  case LSRUse::Basic:
+    // Only handle single-register values.
+    return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+  case LSRUse::Special:
+    // Special case Basic to handle -1 scales.
+    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+
+  case LSRUse::Address:
+    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+                                     HasBaseReg, Scale, AccessTy.AddrSpace);
+
+  case LSRUse::ICmpZero: {
+    // There's not even a target hook for querying whether it would be legal
+    // to fold a GV into an ICmp.
+    if (BaseGV)
+      return false;
+
+    const bool HasScale = Scale != 0;
+
+    // ICmp only has two operands; don't allow more than two non-trivial
+    // parts.
+    if (HasScale && HasBaseReg && BaseOffset != 0)
+      return false;
+
+    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale
+    // by putting the scaled register in the other operand of the icmp.
+    if (HasScale && Scale != -1)
+      return false;
+
+    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+    if (BaseOffset == 0)
+      return true;
+
+    // Ask the target whether it can fold the integer immediate on an icmp.
+    // We have one of:
+    //   ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+    //   ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+    // Negating in unsigned arithmetic does the right thing with INT64_MIN.
+    const int64_t Imm = HasScale ? BaseOffset : (int64_t)(-(uint64_t)BaseOffset);
+    return TTI.isLegalICmpImmediate(Imm);
+  }
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+/// Test whether the expression folds completely at both ends of the offset
+/// range, i.e. with \p BaseOffset + \p MinOffset and with \p BaseOffset +
+/// \p MaxOffset. Returns false if either sum overflows.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 int64_t MinOffset, int64_t MaxOffset,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
+                                 GlobalValue *BaseOffsetGVUnused_DoNotUse = nullptr,
+                                 int64_t BaseOffset = 0,
+                                 bool HasBaseReg = false, int64_t Scale = 0);
+
+/// Test whether formula F folds completely for a use of the given kind over
+/// the offset range [MinOffset, MaxOffset], by forwarding F's BaseGV,
+/// BaseOffset, HasBaseReg and Scale to the field-wise overload.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ const Formula &F) {
+ // For the purpose of isAMCompletelyFolded either having a canonical formula
+ // or a scale not equal to zero is correct.
+ // Problems may arise from non canonical formulae having a scale == 0.
+ // Strictly speaking it would best to just rely on canonical formulae.
+ // However, when we generate the scaled formulae, we first check that the
+ // scaling factor is profitable before computing the actual ScaledReg for
+ // compile time sake.
+ assert((F.isCanonical() || F.Scale != 0));
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether we know how to expand the described formula for a use of the
+/// given \p Kind over the offset range [\p MinOffset, \p MaxOffset].
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+                       int64_t MaxOffset, LSRUse::KindType Kind,
+                       MemAccessTy AccessTy, GlobalValue *BaseGV,
+                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+  // We know how to expand completely foldable formulae.
+  if (isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+                           BaseOffset, HasBaseReg, Scale))
+    return true;
+
+  // Otherwise the only expandable shape is a scale of 1, which can be
+  // expressed as a base register produced by a sum of base registers.
+  if (Scale != 1)
+    return false;
+  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+                              BaseGV, BaseOffset, /*HasBaseReg=*/true,
+                              /*Scale=*/0);
+}
+
+/// Test whether we know how to expand formula F, by forwarding F's BaseGV,
+/// BaseOffset, HasBaseReg and Scale to the field-wise isLegalUse overload.
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const Formula &F) {
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+ F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether formula F folds completely into use LU, taking the offset
+/// range, kind and access type from LU. Unlike the overload that takes the
+/// range and a Formula, this one calls the field-wise overload directly and
+/// so does not assert that F is canonical or has a non-zero scale.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F) {
+ return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
+ F.Scale);
+}
+
+/// Return the extra cost of the scaling factor of \p F when used by \p LU.
+///
+/// A formula without a scale costs nothing. A formula that is not completely
+/// folded costs 1 unless its scale is 1. For a completely folded address use
+/// the target is asked at both ends of the offset range and the larger cost
+/// is returned; every other folded kind is free.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                     const LSRUse &LU, const Formula &F) {
+  if (!F.Scale)
+    return 0;
+
+  // If the use is not completely folded in that instruction, we will have to
+  // pay an extra cost only for scale != 1.
+  const bool IsFolded = isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset,
+                                             LU.Kind, LU.AccessTy, F);
+  if (!IsFolded)
+    return F.Scale != 1;
+
+  switch (LU.Kind) {
+  case LSRUse::ICmpZero:
+  case LSRUse::Basic:
+  case LSRUse::Special:
+    // The use is completely folded, i.e., everything is folded into the
+    // instruction.
+    return 0;
+
+  case LSRUse::Address: {
+    // Check the scaling factor cost with both the min and max offsets.
+    int CostAtMin = TTI.getScalingFactorCost(
+        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+        F.Scale, LU.AccessTy.AddrSpace);
+    int CostAtMax = TTI.getScalingFactorCost(
+        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+        F.Scale, LU.AccessTy.AddrSpace);
+
+    assert(CostAtMin >= 0 && CostAtMax >= 0 &&
+           "Legal addressing mode has an illegal cost!");
+    return std::max(CostAtMin, CostAtMax);
+  }
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+/// Test whether the immediate \p BaseOffset and symbol \p BaseGV can always
+/// be folded into a use of the given \p Kind, assuming conservatively that
+/// the address also has a base and a scale.
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+                             LSRUse::KindType Kind, MemAccessTy AccessTy,
+                             GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg) {
+  // Fast-path: zero is always foldable.
+  if (!BaseGV && BaseOffset == 0)
+    return true;
+
+  // Conservatively, create an address with an immediate and a base and a
+  // scale: -1 for ICmpZero uses and 1 otherwise. A scale of 1 is canonicalized
+  // to a base register if the formula doesn't already have a base register.
+  int64_t Scale;
+  if (Kind == LSRUse::ICmpZero) {
+    Scale = -1;
+  } else if (HasBaseReg) {
+    Scale = 1;
+  } else {
+    Scale = 0;
+    HasBaseReg = true;
+  }
+
+  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
+                              HasBaseReg, Scale);
+}
+
+/// Test whether the expression S consists only of an immediate and/or a
+/// symbol that can always be folded into a use of the given kind over the
+/// offset range [MinOffset, MaxOffset].
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const SCEV *S,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (S->isZero()) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ // NOTE(review): S is a by-value parameter and is tested for zero again
+ // below, so ExtractImmediate/ExtractSymbol presumably take it by reference
+ // and strip the part they extract -- confirm against their definitions.
+ int64_t BaseOffset = ExtractImmediate(S, SE);
+ GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+ // If there's anything else involved, it's not foldable.
+ if (!S->isZero()) return false;
+
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// An individual increment in a Chain of IV increments. Relate an IV user to
+/// an expression that computes the IV it uses from the IV used by the previous
+/// link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+ /// The instruction that uses the IV.
+ Instruction *UserInst;
+ /// The IV operand of UserInst.
+ Value* IVOperand;
+ /// The increment relative to the previous link, or the absolute expression
+ /// for the head of a chain.
+ const SCEV *IncExpr;
+
+ /// Create a link for user U with IV operand O and increment expression E.
+ IVInc(Instruction *U, Value *O, const SCEV *E):
+ UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// The list of IV increments in program order. We typically add the head of a
+// chain without finding subsequent links.
+struct IVChain {
+ // Incs[0] is the head of the chain; any further entries are increments.
+ SmallVector<IVInc,1> Incs;
+ // The base expression passed in when the chain was created; null for a
+ // default-constructed chain.
+ const SCEV *ExprBase;
+
+ // Create an empty chain with no head and no base expression.
+ IVChain() : ExprBase(nullptr) {}
+
+ // Create a chain consisting of just its head.
+ IVChain(const IVInc &Head, const SCEV *Base)
+ : Incs(1, Head), ExprBase(Base) {}
+
+ typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
+
+ // Return the first increment in the chain. This skips the head (Incs[0]),
+ // so [begin(), end()) covers only the increments; the chain must not be
+ // empty.
+ const_iterator begin() const {
+ assert(!Incs.empty());
+ return std::next(Incs.begin());
+ }
+ const_iterator end() const {
+ return Incs.end();
+ }
+
+ // Returns true if this chain contains any increments, i.e. at least one
+ // entry besides the head.
+ bool hasIncs() const { return Incs.size() >= 2; }
+
+ // Add an IVInc to the end of this chain.
+ void add(const IVInc &X) { Incs.push_back(X); }
+
+ // Returns the last UserInst in the chain.
+ Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+ // Returns true if IncExpr can be profitably added to this chain.
+ bool isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution&);
+};
+
+/// Helper for CollectChains to track multiple IV increment uses. Distinguish
+/// between FarUsers that definitely cross IV increments and NearUsers that may
+/// be used between IV increments.
+struct ChainUsers {
+ /// Users that definitely cross IV increments.
+ SmallPtrSet<Instruction*, 4> FarUsers;
+ /// Users that may be used between IV increments.
+ SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// This class holds state for the main loop strength reduction logic.
+///
+/// An instance is constructed for a single loop together with the analyses it
+/// needs; getChanged() reports whether the IR was modified.
+class LSRInstance {
+ // Analyses and target information supplied to the constructor.
+ IVUsers &IU;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ const TargetTransformInfo &TTI;
+ /// The loop being processed.
+ Loop *const L;
+ /// Set to true once the IR has been modified.
+ bool Changed;
+
+ /// This is the insert position that the current loop's induction variable
+ /// increment should be placed. In simple loops, this is the latch block's
+ /// terminator. But in more complicated cases, this is a position which will
+ /// dominate all the in-loop post-increment users.
+ Instruction *IVIncInsertPos;
+
+ /// Interesting factors between use strides.
+ SmallSetVector<int64_t, 8> Factors;
+
+ /// Interesting use types, to facilitate truncation reuse.
+ SmallSetVector<Type *, 4> Types;
+
+ /// The list of operands which are to be replaced.
+ SmallVector<LSRFixup, 16> Fixups;
+
+ /// The list of interesting uses.
+ SmallVector<LSRUse, 16> Uses;
+
+ /// Track which uses use which register candidates.
+ RegUseTracker RegUses;
+
+ // Limit the number of chains to avoid quadratic behavior. We don't expect to
+ // have more than a few IV increment chains in a loop. Missing a Chain falls
+ // back to normal LSR behavior for those uses.
+ static const unsigned MaxChains = 8;
+
+ /// IV users can form a chain of IV increments.
+ SmallVector<IVChain, MaxChains> IVChainVec;
+
+ /// IV users that belong to profitable IVChains.
+ SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+ // Clean-ups of the loop's IV users and terminating condition.
+ void OptimizeShadowIV();
+ bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+ ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+ void OptimizeLoopTermCond();
+
+ // Collection and generation of IV increment chains.
+ void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec);
+ void FinalizeChain(IVChain &Chain);
+ void CollectChains();
+ void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts);
+
+ void CollectInterestingTypesAndFactors();
+ void CollectFixupsAndInitialFormulae();
+
+ /// Append a default-constructed fixup to Fixups and return a reference to
+ /// it. The reference is into the Fixups vector, so it may be invalidated by
+ /// a later call that grows the vector.
+ LSRFixup &getNewFixup() {
+ Fixups.push_back(LSRFixup());
+ return Fixups.back();
+ }
+
+ // Support for sharing of LSRUses between LSRFixups.
+ typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
+ UseMapTy UseMap;
+
+ bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+ LSRUse::KindType Kind, MemAccessTy AccessTy);
+
+ std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+ MemAccessTy AccessTy);
+
+ void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+ LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+ void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void CountRegisters(const Formula &F, size_t LUIdx);
+ bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+ void CollectLoopInvariantFixupsAndFormulae();
+
+ // Generators of alternative formulae for a use.
+ void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+ unsigned Depth = 0);
+
+ void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, unsigned Depth,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg = false);
+ void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateCrossUseConstantOffsets();
+ void GenerateAllReuseFormulae();
+
+ void FilterOutUndesirableDedicatedRegisters();
+
+ // Pruning of the formula search space.
+ size_t EstimateSearchSpaceComplexity() const;
+ void NarrowSearchSpaceByDetectingSupersets();
+ void NarrowSearchSpaceByCollapsingUnrolledCode();
+ void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByPickingWinnerRegs();
+ void NarrowSearchSpaceUsingHeuristics();
+
+ // Selection of one formula per use.
+ void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const;
+ void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+ // Expansion of the chosen formulae and rewriting of the fixups.
+ BasicBlock::iterator
+ HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs) const;
+ BasicBlock::iterator
+ AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const;
+
+ Value *Expand(const LSRFixup &LF,
+ const Formula &F,
+ BasicBlock::iterator IP,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts) const;
+ void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+ const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts) const;
+ void Rewrite(const LSRFixup &LF,
+ const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts) const;
+ void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
+
+public:
+ LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
+ LoopInfo &LI, const TargetTransformInfo &TTI);
+
+ /// Return true if the IR was modified.
+ bool getChanged() const { return Changed; }
+
+ // Debug printing.
+ void print_factors_and_types(raw_ostream &OS) const;
+ void print_fixups(raw_ostream &OS) const;
+ void print_uses(raw_ostream &OS) const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+}
+
+/// If IV is used in an int-to-float cast inside the loop then try to eliminate
+/// the cast operation.
+///
+/// Nothing is done when the loop's backedge-taken count cannot be computed.
+/// At most one cast is rewritten per call; on success Changed is set to true.
+void LSRInstance::OptimizeShadowIV() {
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+ UI != E; /* empty */) {
+ // Advance UI before examining the candidate so the loop iterator never
+ // refers to the use being processed.
+ IVUsers::const_iterator CandidateUI = UI;
+ ++UI;
+ Instruction *ShadowUse = CandidateUI->getUser();
+ Type *DestTy = nullptr;
+ bool IsSigned = false;
+
+ /* If shadow use is a int->float cast then insert a second IV
+ to eliminate this cast.
+
+ for (unsigned i = 0; i < n; ++i)
+ foo((double)i);
+
+ is transformed into
+
+ double d = 0.0;
+ for (unsigned i = 0; i < n; ++i, ++d)
+ foo(d);
+ */
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = false;
+ DestTy = UCast->getDestTy();
+ }
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = true;
+ DestTy = SCast->getDestTy();
+ }
+ // Not an integer-to-floating-point cast.
+ if (!DestTy) continue;
+
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
+ if (!TTI.isTypeLegal(DestTy)) continue;
+
+ // The cast operand must be a PHI with exactly two incoming values.
+ PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+ if (!PH) continue;
+ if (PH->getNumIncomingValues() != 2) continue;
+
+ // Skip the cast when the integer type is wider than the mantissa of the
+ // floating-point type (or the mantissa width is unknown).
+ Type *SrcTy = PH->getType();
+ int Mantissa = DestTy->getFPMantissaWidth();
+ if (Mantissa == -1) continue;
+ if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+ continue;
+
+ // Work out which incoming edge is the loop entry and which is the latch:
+ // the one coming from the preheader is the entry, the other the latch.
+ unsigned Entry, Latch;
+ if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+ Entry = 0;
+ Latch = 1;
+ } else {
+ Entry = 1;
+ Latch = 0;
+ }
+
+ // The initial value must be a constant integer; convert it to floating
+ // point using the signedness of the cast.
+ ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+ if (!Init) continue;
+ Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+ (double)Init->getSExtValue() :
+ (double)Init->getZExtValue());
+
+ // The latch value must be an add or sub of the PHI and a constant.
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+ if (!Incr) continue;
+ if (Incr->getOpcode() != Instruction::Add
+ && Incr->getOpcode() != Instruction::Sub)
+ continue;
+
+ /* Initialize new IV, double d = 0.0 in above example. */
+ ConstantInt *C = nullptr;
+ if (Incr->getOperand(0) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+ else if (Incr->getOperand(1) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+ else
+ continue;
+
+ if (!C) continue;
+
+ // Ignore negative constants, as the code below doesn't handle them
+ // correctly. TODO: Remove this restriction.
+ if (!C->getValue().isStrictlyPositive()) continue;
+
+ /* Add new PHINode. */
+ PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+ /* create new increment. '++d' in above example. */
+ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+ BinaryOperator *NewIncr =
+ BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+ Instruction::FAdd : Instruction::FSub,
+ NewPH, CFP, "IV.S.next.", Incr);
+
+ NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+ NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+ /* Remove cast operation */
+ ShadowUse->replaceAllUsesWith(NewPH);
+ ShadowUse->eraseFromParent();
+ Changed = true;
+ // Stop after the first rewritten cast.
+ // NOTE(review): presumably because erasing the user may invalidate the
+ // IVUsers iteration -- confirm.
+ break;
+ }
+}
+
+/// Look for an IV use whose user is \p Cond. If one is found, store it in
+/// \p CondUse and return true; otherwise leave \p CondUse untouched and
+/// return false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+  for (IVStrideUse &Candidate : IU) {
+    if (Candidate.getUser() != Cond)
+      continue;
+
+    // NOTE: we could handle setcc instructions with multiple uses here, but
+    // InstCombine does it as well for simple uses, it's not clear that it
+    // occurs enough in real life to handle.
+    CondUse = &Candidate;
+    return true;
+  }
+  return false;
+}
+
+/// Rewrite the loop's terminating condition if it uses a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+///
+/// if (n > 0) {
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+/// }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+/// i = 0;
+/// max = n < 1 ? 1 : n;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting this type of loop and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT (or ICMP_SLE /
+/// ICMP_ULT; inverted when the compare was ICMP_EQ), and deleting the max.
+/// Returns the new compare, or Cond unchanged when the pattern doesn't match.
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+ // Check that the loop matches the pattern we're looking for.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+ Cond->getPredicate() != CmpInst::ICMP_NE)
+ return Cond;
+
+ SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+ if (!Sel || !Sel->hasOneUse()) return Cond; // the select is erased below, so Cond must be its only user
+
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return Cond;
+ const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+ // Add one to the backedge-taken count to get the trip count.
+ const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+ if (IterationCount != SE.getSCEV(Sel)) return Cond; // the select must compute exactly the trip count
+
+ // Check for a max calculation that matches the pattern. There's no check
+ // for ICMP_ULE here because the comparison would be with zero, which
+ // isn't interesting.
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ const SCEVNAryExpr *Max = nullptr;
+ if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+ Pred = ICmpInst::ICMP_SLE;
+ Max = S;
+ } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_SLT;
+ Max = S;
+ } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_ULT;
+ Max = U;
+ } else {
+ // No match; bail.
+ return Cond;
+ }
+
+ // To handle a max with more than two operands, this optimization would
+ // require additional checking and setup.
+ if (Max->getNumOperands() != 2)
+ return Cond;
+
+ const SCEV *MaxLHS = Max->getOperand(0);
+ const SCEV *MaxRHS = Max->getOperand(1);
+
+ // ScalarEvolution canonicalizes constants to the left. For < and >, look
+ // for a comparison with 1. For <= and >=, a comparison with zero.
+ if (!MaxLHS ||
+ (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+ return Cond;
+
+ // Check the relevant induction variable for conformance to
+ // the pattern: an affine addrec that starts at one and steps by one.
+ const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine() ||
+ AR->getStart() != One ||
+ AR->getStepRecurrence(SE) != One)
+ return Cond;
+
+ assert(AR->getLoop() == L &&
+ "Loop condition operand is an addrec in a different loop!");
+
+ // Check the right operand of the select, and remember it, as it will
+ // be used in the new comparison instruction.
+ Value *NewRHS = nullptr;
+ if (ICmpInst::isTrueWhenEqual(Pred)) {
+ // Look for n+1, and grab n. Both select arms are inspected; if both match, the later one wins.
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (!NewRHS)
+ return Cond;
+ } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+ NewRHS = Sel->getOperand(1);
+ else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+ NewRHS = Sel->getOperand(2);
+ else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+ NewRHS = SU->getValue();
+ else
+ // Max doesn't match expected pattern.
+ return Cond;
+
+ // Determine the new comparison opcode. It may be signed or unsigned,
+ // and the original comparison may be either equality or inequality.
+ if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ // Ok, everything looks ok to change the condition to the predicate chosen
+ // above (SLT/SLE/ULT or its inverse) and delete the max calculation.
+ ICmpInst *NewCond =
+ new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+ // Delete the max calculation instructions.
+ Cond->replaceAllUsesWith(NewCond);
+ CondUse->setUser(NewCond); // keep the IV use record attached to the live compare
+ Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); // fetch the select's operand 0 before the select is erased
+ Cond->eraseFromParent();
+ Sel->eraseFromParent();
+ if (Cmp->use_empty())
+ Cmp->eraseFromParent();
+ return NewCond;
+}
+
+/// Change loop terminating condition to use the postinc iv when possible. Also sets IVIncInsertPos and sets Changed on a rewrite.
+void
+LSRInstance::OptimizeLoopTermCond() {
+ SmallPtrSet<Instruction *, 4> PostIncs;
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ for (BasicBlock *ExitingBlock : ExitingBlocks) {
+
+ // Get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
+
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
+ continue;
+ // FIXME: Overly conservative, termination condition could be an 'or' etc..
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ continue;
+
+ // Search the IV users (via FindIVUserForCond) for Cond's IVStrideUse, if any.
+ IVStrideUse *CondUse = nullptr;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse))
+ continue;
+
+ // If the trip count is computed in terms of a max (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT or ULT instead of NE.
+ // One consequence of doing this now is that it disrupts the count-down
+ // optimization. That's not always a bad thing though, because in such
+ // cases it may still be worthwhile to avoid a max.
+ Cond = OptimizeMax(Cond, CondUse);
+
+ // If this exiting block dominates the latch block, it may also use
+ // the post-inc value if it won't be shared with other uses.
+ // Check for dominance.
+ if (!DT.dominates(ExitingBlock, LatchBlock))
+ continue;
+
+ // Conservatively avoid trying to use the post-inc value in non-latch
+ // exits if there may be pre-inc users in intervening blocks.
+ if (LatchBlock != ExitingBlock)
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+ // Test if the use is reachable from the exiting block. This dominator
+ // query is a conservative approximation of reachability.
+ if (&*UI != CondUse &&
+ !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+ // Conservatively assume there may be reuse if the quotient of their
+ // strides could be a legal scale.
+ const SCEV *A = IU.getStride(*CondUse, L);
+ const SCEV *B = IU.getStride(*UI, L);
+ if (!A || !B) continue;
+ if (SE.getTypeSizeInBits(A->getType()) !=
+ SE.getTypeSizeInBits(B->getType())) {
+ if (SE.getTypeSizeInBits(A->getType()) >
+ SE.getTypeSizeInBits(B->getType()))
+ B = SE.getSignExtendExpr(B, A->getType());
+ else
+ A = SE.getSignExtendExpr(A, B->getType());
+ }
+ if (const SCEVConstant *D =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+ const ConstantInt *C = D->getValue();
+ // Stride of one or negative one can have reuse with non-addresses.
+ if (C->isOne() || C->isAllOnesValue())
+ goto decline_post_inc;
+ // Avoid weird situations: very wide quotients, or the minimum signed value (negated below).
+ if (C->getValue().getMinSignedBits() >= 64 ||
+ C->getValue().isMinSignedValue())
+ goto decline_post_inc;
+ // Check for possible scaled-address reuse, with either sign of the scale.
+ MemAccessTy AccessTy = getAccessType(UI->getUser());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
+ }
+
+ DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
+
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the exiting block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != TermBr) { // is the instruction right after Cond the branch?
+ if (Cond->hasOneUse()) {
+ Cond->moveBefore(TermBr);
+ } else {
+ // Clone the terminating condition and insert it right before the branch.
+ ICmpInst *OldCond = Cond;
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+ TermBr->replaceUsesOfWith(OldCond, Cond);
+ }
+ }
+
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->transformToPostInc(L);
+ Changed = true;
+
+ PostIncs.insert(Cond);
+ decline_post_inc:; // target of the gotos above: the rewrite is skipped for this exiting block
+ }
+
+ // Determine an insertion point for the loop induction variable increment. It
+ // must dominate all the post-inc comparisons we just set up, and it must
+ // dominate the loop latch edge.
+ IVIncInsertPos = L->getLoopLatch()->getTerminator(); // start at the latch terminator, then hoist as needed
+ for (Instruction *Inst : PostIncs) {
+ BasicBlock *BB =
+ DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+ Inst->getParent());
+ if (BB == Inst->getParent())
+ IVIncInsertPos = Inst;
+ else if (BB != IVIncInsertPos->getParent())
+ IVIncInsertPos = BB->getTerminator();
+ }
+}
+
+/// Decide whether a fixup at NewOffset, with the given kind and access type,
+/// can share the existing use LU. On success LU's offset range, access type
+/// and offset list are updated and true is returned; on failure LU is left
+/// unmodified and false is returned.
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+                                     bool HasBaseReg, LSRUse::KindType Kind,
+                                     MemAccessTy AccessTy) {
+  // Uses of different kinds are never merged. Collapsing them to something
+  // conservative is tempting, but it can pessimize cases such as a use whose
+  // users all live outside the loop.
+  if (Kind != LU.Kind)
+    return false;
+
+  // For address uses with differing access types, fall back to the unknown
+  // access type.
+  // TODO: Be less conservative when the type is similar and can use the same
+  // addressing modes.
+  MemAccessTy MergedAccessTy = AccessTy;
+  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
+    MergedAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext());
+
+  // If the new offset falls outside the current [MinOffset, MaxOffset] range,
+  // the distance from the opposite end of the range to the new offset must be
+  // always foldable before the range may be widened.
+  int64_t MergedMin = LU.MinOffset;
+  int64_t MergedMax = LU.MaxOffset;
+  if (NewOffset < MergedMin) {
+    const int64_t Span = MergedMax - NewOffset;
+    if (!isAlwaysFoldable(TTI, Kind, MergedAccessTy, /*BaseGV=*/nullptr,
+                          Span, HasBaseReg))
+      return false;
+    MergedMin = NewOffset;
+  } else if (NewOffset > MergedMax) {
+    const int64_t Span = NewOffset - MergedMin;
+    if (!isAlwaysFoldable(TTI, Kind, MergedAccessTy, /*BaseGV=*/nullptr,
+                          Span, HasBaseReg))
+      return false;
+    MergedMax = NewOffset;
+  }
+
+  // Every check passed; commit the merged state to the use.
+  LU.MinOffset = MergedMin;
+  LU.MaxOffset = MergedMax;
+  LU.AccessTy = MergedAccessTy;
+  if (LU.Offsets.back() != NewOffset)
+    LU.Offsets.push_back(NewOffset);
+  return true;
+}
+
+/// Find or create the LSRUse that serves a fixup needing expression Expr with
+/// the given kind and optional access type. Returns the index of that use in
+/// Uses together with the immediate offset split off from Expr. Expr is
+/// updated in place to the offset-free expression when the offset is
+/// foldable, and restored to its original value (with offset 0) otherwise.
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+                                               LSRUse::KindType Kind,
+                                               MemAccessTy AccessTy) {
+  const SCEV *const OrigExpr = Expr;
+  int64_t Offset = ExtractImmediate(Expr, SE);
+
+  // Some kinds cannot absorb an offset at all (basic uses, for example); in
+  // that case undo the split.
+  const bool OffsetFolds =
+      isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/nullptr, Offset,
+                       /*HasBaseReg=*/true);
+  if (!OffsetFolds) {
+    Expr = OrigExpr;
+    Offset = 0;
+  }
+
+  std::pair<UseMapTy::iterator, bool> Slot =
+      UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+  const bool AlreadyMapped = !Slot.second;
+  if (AlreadyMapped) {
+    // A use with this base exists; share it if the new offset fits.
+    const size_t ExistingIdx = Slot.first->second;
+    if (reconcileNewOffset(Uses[ExistingIdx], Offset, /*HasBaseReg=*/true,
+                           Kind, AccessTy))
+      return std::make_pair(ExistingIdx, Offset);
+  }
+
+  // Otherwise append a fresh use and make the map entry refer to it.
+  const size_t NewIdx = Uses.size();
+  Slot.first->second = NewIdx;
+  Uses.push_back(LSRUse(Kind, AccessTy));
+  LSRUse &NewLU = Uses[NewIdx];
+
+  // Redundant offsets need not be tracked, but there is no need to go out of
+  // our way to avoid them either.
+  if (NewLU.Offsets.empty() || NewLU.Offsets.back() != Offset)
+    NewLU.Offsets.push_back(Offset);
+
+  NewLU.MinOffset = Offset;
+  NewLU.MaxOffset = Offset;
+  return std::make_pair(NewIdx, Offset);
+}
+
+/// Remove the use LU, located at index LUIdx, from the Uses list by swapping
+/// it with the last element and popping, then mirror that move in RegUses.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+  LSRUse &LastUse = Uses.back();
+  if (&LastUse != &LU)
+    std::swap(LU, LastUse);
+  Uses.pop_back();
+
+  // After the pop, Uses.size() is the index the moved element used to have.
+  const size_t FormerLastIdx = Uses.size();
+  RegUses.swapAndDropUse(LUIdx, FormerLastIdx);
+}
+
+/// Look for a use distinct from OrigLU which has a formula with the same
+/// registers as the given formula and a zero base offset. Returns that use,
+/// or nullptr when there is none.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+                                       const LSRUse &OrigLU) {
+  // Search all uses for the formula. This could be more clever.
+  for (size_t Idx = 0, End = Uses.size(); Idx != End; ++Idx) {
+    LSRUse &Candidate = Uses[Idx];
+
+    // Only uses that are close enough to OrigLU are worth scanning.
+    if (&Candidate == &OrigLU)
+      continue;
+    // Ignore ICmpZero uses because they may contain formulae generated by
+    // GenerateICmpZeroScales, in which case adding fixup offsets may be
+    // invalid.
+    if (Candidate.Kind == LSRUse::ICmpZero)
+      continue;
+    if (Candidate.Kind != OrigLU.Kind || !(OrigLU.AccessTy == Candidate.AccessTy))
+      continue;
+    if (!(Candidate.WidestFixupType == OrigLU.WidestFixupType))
+      continue;
+    if (!Candidate.HasFormulaWithSameRegs(OrigF))
+      continue;
+
+    // Scan through this use's formulae for the one whose registers and
+    // symbols all agree with OrigF.
+    for (const Formula &F : Candidate.Formulae) {
+      const bool SameShape = F.BaseRegs == OrigF.BaseRegs &&
+                             F.ScaledReg == OrigF.ScaledReg &&
+                             F.BaseGV == OrigF.BaseGV &&
+                             F.Scale == OrigF.Scale &&
+                             F.UnfoldedOffset == OrigF.UnfoldedOffset;
+      if (!SameShape)
+        continue;
+      if (F.BaseOffset == 0)
+        return &Candidate;
+      // This was the one formula where everything matched; there will not be
+      // another. Since it was declined, move on to the next LSRUse.
+      break;
+    }
+  }
+
+  // Nothing looked good.
+  return nullptr;
+}
+
+/// Populate Types with the effective SCEV type of every IV user expression and
+/// Factors with the constant quotients found between pairs of strides of this
+/// loop's add recurrences.
+void LSRInstance::CollectInterestingTypesAndFactors() {
+  SmallSetVector<const SCEV *, 4> Strides;
+
+  // Pass 1: record each user's type and walk its expression for strides.
+  SmallVector<const SCEV *, 4> Pending;
+  for (const IVStrideUse &U : IU) {
+    const SCEV *UseExpr = IU.getExpr(U);
+    Types.insert(SE.getEffectiveSCEVType(UseExpr->getType()));
+
+    // Descend through adds and addrec start values, noting the step of every
+    // addrec that belongs to this loop.
+    Pending.push_back(UseExpr);
+    while (!Pending.empty()) {
+      const SCEV *Cur = Pending.pop_back_val();
+      if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Cur)) {
+        Pending.append(Add->op_begin(), Add->op_end());
+        continue;
+      }
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Cur);
+      if (!AR)
+        continue;
+      if (AR->getLoop() == L)
+        Strides.insert(AR->getStepRecurrence(SE));
+      Pending.push_back(AR->getStart());
+    }
+  }
+
+  // Pass 2: compute interesting factors from every unordered pair of strides.
+  for (auto OuterIt = Strides.begin(), End = Strides.end(); OuterIt != End;
+       ++OuterIt) {
+    for (auto InnerIt = std::next(OuterIt); InnerIt != End; ++InnerIt) {
+      const SCEV *OldStride = *OuterIt;
+      const SCEV *NewStride = *InnerIt;
+
+      // Sign-extend the narrower stride so both have the same width.
+      const auto OldBits = SE.getTypeSizeInBits(OldStride->getType());
+      const auto NewBits = SE.getTypeSizeInBits(NewStride->getType());
+      if (OldBits > NewBits)
+        NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+      else if (OldBits < NewBits)
+        OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+
+      // Try New/Old first; only if that is not a constant, try Old/New.
+      const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(
+          getExactSDiv(NewStride, OldStride, SE, true));
+      if (!Factor)
+        Factor = dyn_cast_or_null<SCEVConstant>(
+            getExactSDiv(OldStride, NewStride, SE, true));
+
+      // Keep the factor only when it fits in 64 signed bits.
+      if (Factor && Factor->getAPInt().getMinSignedBits() <= 64)
+        Factors.insert(Factor->getAPInt().getSExtValue());
+    }
+  }
+
+  // If all uses use the same type, don't bother looking for truncation-based
+  // reuse.
+  if (Types.size() == 1)
+    Types.clear();
+
+  DEBUG(print_factors_and_types(dbgs()));
+}
+
+/// Helper for CollectChains: scan the operand range [OI,OE) and return an
+/// iterator to the first operand that is an instruction whose SCEV is an
+/// AddRec of loop L, or OE when there is none. If IVUsers mapped Instructions
+/// to IVStrideUses, this could be partially skipped.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+              Loop *L, ScalarEvolution &SE) {
+  for (; OI != OE; ++OI) {
+    Instruction *OperInst = dyn_cast<Instruction>(*OI);
+    if (!OperInst || !SE.isSCEVable(OperInst->getType()))
+      continue;
+
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(OperInst));
+    if (AR && AR->getLoop() == L)
+      return OI;
+  }
+  return OI;
+}
+
+/// IVChain logic must consistently peek through TruncInst operands to their
+/// wider source, so that step is wrapped in this helper. Any other value is
+/// returned unchanged.
+static Value *getWideOperand(Value *Oper) {
+  TruncInst *Trunc = dyn_cast<TruncInst>(Oper);
+  return Trunc ? Trunc->getOperand(0) : Oper;
+}
+
+/// Return true if an IV chain may contain both values: either their types are
+/// identical, or both are pointer types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+  Type *LTy = LVal->getType();
+  Type *RTy = RVal->getType();
+  if (LTy == RTy)
+    return true;
+  return LTy->isPointerTy() && RTy->isPointerTy();
+}
+
+/// Return an approximation of this SCEV expression's "base", or NULL for any
+/// constant. Returning the expression itself is conservative. Returning a
+/// deeper subexpression is more precise and valid as long as it isn't less
+/// complex than another subexpression. For expressions involving multiple
+/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
+/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
+/// IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+  switch (S->getSCEVType()) {
+  case scConstant:
+    return nullptr;
+
+  // Casts and add recurrences are looked through to their operand / start.
+  case scTruncate:
+    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+  case scZeroExtend:
+    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+  case scSignExtend:
+    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+  case scAddRecExpr:
+    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+
+  case scAddExpr: {
+    // Walk the add's operands from right to left, skipping scaled operands
+    // (scMulExpr) as long as there's nothing more complex.
+    // FIXME: not sure if we want to recognize negation.
+    const SCEVAddExpr *Sum = cast<SCEVAddExpr>(S);
+    for (SCEVAddExpr::op_iterator It = Sum->op_end(), Begin = Sum->op_begin();
+         It != Begin;) {
+      const SCEV *Term = *--It;
+      const auto TermKind = Term->getSCEVType();
+      if (TermKind == scMulExpr)
+        continue;
+      return TermKind == scAddExpr ? getExprBase(Term) : Term;
+    }
+    return S; // all operands are scaled, be conservative.
+  }
+
+  default: // including scUnknown.
+    return S;
+  }
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as chain increment as long as it's not obviously
+/// expensive to expand using real instructions.
+///
+/// OperExpr is the SCEV of the candidate IV operand and IncExpr its increment
+/// relative to the previous link of the chain.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+                                    const SCEV *IncExpr,
+                                    ScalarEvolution &SE) {
+  // Aggressively form chains when -stress-ivchain.
+  if (StressIVChain)
+    return true;
+
+  // Do not replace a constant offset from IV head with a nonconstant IV
+  // increment.
+  if (!isa<SCEVConstant>(IncExpr)) {
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+      return false;
+  }
+
+  // Otherwise the increment is profitable unless it is costly to expand.
+  SmallPtrSet<const SCEV*, 8> Processed;
+  return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in another chain.
+static bool
+isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+ if (StressIVChain)
+ return true;
+
+ if (!Chain.hasIncs())
+ return false;
+
+ if (!Users.empty()) {
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst : Users) {
+ dbgs() << " " << *Inst << "\n";
+ });
+ return false;
+ }
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+ // The chain itself may require a register, so initialize cost to 1.
+ int cost = 1;
+
+ // A complete chain likely eliminates the need for keeping the original IV in
+ // a register. LSR does not currently know how to form a complete chain unless
+ // the header phi already exists.
+ if (isa<PHINode>(Chain.tailUserInst())
+ && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+ --cost;
+ }
+ const SCEV *LastIncExpr = nullptr;
+ unsigned NumConstIncrements = 0;
+ unsigned NumVarIncrements = 0;
+ unsigned NumReusedIncrements = 0;
+ for (const IVInc &Inc : Chain) {
+ if (Inc.IncExpr->isZero())
+ continue;
+
+ // Incrementing by some nonzero constant is neutral. We assume constants can
+ // be folded into an addressing mode or an add's immediate operand.
+ if (isa<SCEVConstant>(Inc.IncExpr)) {
+ ++NumConstIncrements;
+ continue;
+ }
+
+ if (Inc.IncExpr == LastIncExpr) // same variable increment as the previous variable one
+ ++NumReusedIncrements;
+ else
+ ++NumVarIncrements;
+
+ LastIncExpr = Inc.IncExpr;
+ }
+ // An IV chain with a single increment is handled by LSR's postinc
+ // uses. However, a chain with multiple increments requires keeping the IV's
+ // value live longer than it needs to be if chained.
+ if (NumConstIncrements > 1)
+ --cost;
+
+ // Materializing increment expressions in the preheader that didn't exist in
+ // the original code may cost a register. For example, sign-extended array
+ // indices can produce ridiculous increments like this:
+ // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+ cost += NumVarIncrements;
+
+ // Reusing variable increments likely saves a register to hold the multiple of
+ // the stride.
+ cost -= NumReusedIncrements;
+
+ DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
+
+ return cost < 0; // profitable only when the estimate shows a net register saving
+}
+
+/// Add this IV user to an existing chain or make it the head of a new chain. ChainUsersVec is indexed in parallel with IVChainVec.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+ // When IVs are used as types of varying widths, they are generally converted
+ // to a wider type with some uses remaining narrow under a (free) trunc.
+ Value *const NextIV = getWideOperand(IVOper);
+ const SCEV *const OperExpr = SE.getSCEV(NextIV);
+ const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+ // Visit all existing chains. Check if IVOper can be computed as a
+ // profitable loop invariant increment from the last link in the Chain.
+ unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ const SCEV *LastIncExpr = nullptr;
+ for (; ChainIdx < NChains; ++ChainIdx) {
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ // Prune the solution space aggressively by checking that both IV operands
+ // are expressions that operate on the same unscaled SCEVUnknown. This
+ // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+ // first avoids creating extra SCEV expressions.
+ if (!StressIVChain && Chain.ExprBase != OperExprBase)
+ continue;
+
+ Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+ if (!isCompatibleIVType(PrevIV, NextIV))
+ continue;
+
+ // A phi node terminates a chain.
+ if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+ continue;
+
+ // The increment must be loop-invariant so it can be kept in a register.
+ const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+ const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+ if (!SE.isLoopInvariant(IncExpr, L))
+ continue;
+
+ if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+ LastIncExpr = IncExpr;
+ break; // the first acceptable chain wins; ChainIdx now identifies it
+ }
+ }
+ // If we haven't found a chain, create a new one, unless we hit the max. Don't
+ // bother for phi nodes, because they must be last in the chain.
+ if (ChainIdx == NChains) {
+ if (isa<PHINode>(UserInst))
+ return;
+ if (NChains >= MaxChains && !StressIVChain) {
+ DEBUG(dbgs() << "IV Chain Limit\n");
+ return;
+ }
+ LastIncExpr = OperExpr; // for a chain head, the "increment" is the full operand expression
+ // IVUsers may have skipped over sign/zero extensions. We don't currently
+ // attempt to form chains involving extensions unless they can be hoisted
+ // into this loop's AddRec.
+ if (!isa<SCEVAddRecExpr>(LastIncExpr))
+ return;
+ ++NChains;
+ IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+ OperExprBase));
+ ChainUsersVec.resize(NChains);
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
+ } else {
+ DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
+ // Add this IV user to the end of the chain.
+ IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+ }
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+ // This chain's NearUsers become FarUsers, unless the increment is zero.
+ if (!LastIncExpr->isZero()) {
+ ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+ NearUsers.end());
+ NearUsers.clear();
+ }
+
+ // All other uses of IVOperand become near uses of the chain.
+ // We currently ignore intermediate values within SCEV expressions, assuming
+ // they will eventually be used by the current chain, or can be computed
+ // from one of the chain increments. To be more precise we could
+ // transitively follow its user and only add leaf IV users to the set.
+ for (User *U : IVOper->users()) {
+ Instruction *OtherUse = dyn_cast<Instruction>(U);
+ if (!OtherUse)
+ continue;
+ // Uses in the chain will no longer be uses if the chain is formed.
+ // Include the head of the chain in this iteration (not Chain.begin()).
+ IVChain::const_iterator IncIter = Chain.Incs.begin();
+ IVChain::const_iterator IncEnd = Chain.Incs.end();
+ for( ; IncIter != IncEnd; ++IncIter) {
+ if (IncIter->UserInst == OtherUse)
+ break;
+ }
+ if (IncIter != IncEnd) // OtherUse is already a link of this chain
+ continue;
+
+ if (SE.isSCEVable(OtherUse->getType())
+ && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+ && IU.isIVUserOrOperand(OtherUse)) {
+ continue;
+ }
+ NearUsers.insert(OtherUse);
+ }
+
+ // Since this user is part of the chain, it's no longer considered a use
+ // of the chain.
+ ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// Populate the vector of Chains (IVChainVec), keeping only the profitable ones.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+/// = A[i]
+/// = A[i+x]
+/// A[i] =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+ DEBUG(dbgs() << "Collecting IV Chains.\n");
+ SmallVector<ChainUsers, 8> ChainUsersVec;
+
+ SmallVector<BasicBlock *,8> LatchPath;
+ BasicBlock *LoopHeader = L->getHeader();
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch()); // climb immediate dominators from the latch
+ Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+ LatchPath.push_back(Rung->getBlock());
+ }
+ LatchPath.push_back(LoopHeader);
+
+ // Walk the instruction stream from the loop header to the loop latch.
+ for (SmallVectorImpl<BasicBlock *>::reverse_iterator
+ BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
+ BBIter != BBEnd; ++BBIter) {
+ for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
+ I != E; ++I) {
+ // Skip phis and instructions that weren't seen by IVUsers analysis.
+ if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I))
+ continue;
+
+ // Ignore users that are part of a SCEV expression. This way we only
+ // consider leaf IV Users. This effectively rediscovers a portion of
+ // IVUsers analysis but in program order this time.
+ if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I)))
+ continue;
+
+ // Remove this instruction from any NearUsers set it may be in.
+ for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ ChainIdx < NChains; ++ChainIdx) {
+ ChainUsersVec[ChainIdx].NearUsers.erase(&*I);
+ }
+ // Search for operands that can be chained; each distinct operand is tried once.
+ SmallPtrSet<Instruction*, 4> UniqueOperands;
+ User::op_iterator IVOpEnd = I->op_end();
+ User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
+ while (IVOpIter != IVOpEnd) {
+ Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+ if (UniqueOperands.insert(IVOpInst).second)
+ ChainInstruction(&*I, IVOpInst, ChainUsersVec);
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ } // Continue walking down the instructions.
+ } // Continue walking down the domtree.
+ // Visit phi backedges to determine if the chain can generate the IV postinc.
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) { // stops at the first non-phi instruction
+ if (!SE.isSCEVable(PN->getType()))
+ continue;
+
+ Instruction *IncV =
+ dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+ if (IncV)
+ ChainInstruction(PN, IncV, ChainUsersVec);
+ }
+ // Remove any unprofitable chains, compacting the survivors to the front.
+ unsigned ChainIdx = 0;
+ for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+ UsersIdx < NChains; ++UsersIdx) {
+ if (!isProfitableChain(IVChainVec[UsersIdx],
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+ continue;
+ // Preserve the chain at UsersIdx.
+ if (ChainIdx != UsersIdx)
+ IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+ FinalizeChain(IVChainVec[ChainIdx]);
+ ++ChainIdx;
+ }
+ IVChainVec.resize(ChainIdx);
+}
+
+/// Commit a chain that CollectChains decided to keep: for every increment in
+/// \p Chain, locate the operand slot of the user instruction that holds the
+/// chained IV operand and record that slot in IVIncSet.
+/// CollectFixupsAndInitialFormulae consults IVIncSet to skip these operands.
+///
+/// \param Chain a non-empty IV chain (asserted).
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+  DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+  for (const IVInc &Inc : Chain) {
+    // Stream the instruction itself, as the "Final Chain" line above does.
+    // Streaming the bare pointer would only log an address.
+    DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+    auto UseI = std::find(Inc.UserInst->op_begin(), Inc.UserInst->op_end(),
+                          Inc.IVOperand);
+    assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
+    IVIncSet.insert(UseI);
+  }
+}
+
+/// Decide whether the increment \p IncExpr can be absorbed as an offset of the
+/// memory access performed by \p UserInst through \p Operand.
+///
+/// \returns true only when \p IncExpr is a constant that fits in 64 signed
+/// bits, \p Operand is used as an address by \p UserInst, and isAlwaysFoldable
+/// accepts that offset for an LSRUse::Address use with no base register and
+/// no global.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+                             Value *Operand, const TargetTransformInfo &TTI) {
+  const auto *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+  if (!IncConst)
+    return false;
+  if (!isAddressUse(UserInst, Operand))
+    return false;
+
+  // The offset is handed on as an int64_t, so wider constants are rejected.
+  if (IncConst->getAPInt().getMinSignedBits() > 64)
+    return false;
+
+  int64_t IncOffset = IncConst->getValue()->getSExtValue();
+  MemAccessTy AccessTy = getAccessType(UserInst);
+  return isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+                          IncOffset, /*HasBaseReg=*/false);
+}
+
+/// Generate an add or subtract for each IVInc in a chain to materialize the IV
+/// user's operand from the previous IV user's operand. Every operand that gets replaced is appended to \p DeadInsts; if no usable operand is found on the chain head, the function returns before rewriting any operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts) {
+ // Find the new IVOperand for the head of the chain. It may have been replaced
+ // by LSR.
+ const IVInc &Head = Chain.Incs[0];
+ User::op_iterator IVOpEnd = Head.UserInst->op_end();
+ // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+ User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+ IVOpEnd, L, SE);
+ Value *IVSrc = nullptr;
+ while (IVOpIter != IVOpEnd) {
+ IVSrc = getWideOperand(*IVOpIter);
+
+ // If this operand computes the expression that the chain needs, we may use
+ // it. (Check this after setting IVSrc which is used below.)
+ //
+ // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+ // narrow for the chain, so we can no longer use it. We do allow using a
+ // wider phi, assuming the LSR checked for free truncation. In that case we
+ // should already have a truncate on this operand such that
+ // getSCEV(IVSrc) == IncExpr.
+ if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+ || SE.getSCEV(IVSrc) == Head.IncExpr) {
+ break;
+ }
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ if (IVOpIter == IVOpEnd) {
+ // Gracefully give up on this chain.
+ DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ return;
+ }
+
+ DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ Type *IVTy = IVSrc->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+ const SCEV *LeftOverExpr = nullptr; // Increment accumulated since IVSrc was last updated; null when nothing is pending.
+ for (const IVInc &Inc : Chain) {
+ Instruction *InsertPt = Inc.UserInst;
+ if (isa<PHINode>(InsertPt)) // For a PHI user, new code is placed before the latch terminator instead.
+ InsertPt = L->getLoopLatch()->getTerminator();
+
+ // IVOper will replace the current IV User's operand. IVSrc is the IV
+ // value currently held in a register.
+ Value *IVOper = IVSrc;
+ if (!Inc.IncExpr->isZero()) {
+ // IncExpr was the result of subtraction of two narrow values, so must
+ // be signed.
+ const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
+ LeftOverExpr = LeftOverExpr ?
+ SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+ }
+ if (LeftOverExpr && !LeftOverExpr->isZero()) {
+ // Expand the IV increment.
+ Rewriter.clearPostInc();
+ Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+ const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+ SE.getUnknown(IncV));
+ IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+ // If an IV increment can't be folded, use it as the next IV value.
+ if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
+ assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+ IVSrc = IVOper;
+ LeftOverExpr = nullptr; // The increment now lives in IVSrc, so nothing is carried forward.
+ }
+ }
+ Type *OperTy = Inc.IVOperand->getType();
+ if (IVTy != OperTy) { // The replaced operand may have a different type than the chain's IV; cast down to it.
+ assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+ "cannot extend a chained IV");
+ IRBuilder<> Builder(InsertPt);
+ IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+ }
+ Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+ DeadInsts.emplace_back(Inc.IVOperand);
+ }
+ // If LSR created a new, wider phi, we may also replace its postinc. We only
+ // do this if we also found a wide value for the head of the chain.
+ if (isa<PHINode>(Chain.tailUserInst())) {
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ PHINode *Phi = dyn_cast<PHINode>(I); ++I) { // The loop ends at the first non-PHI instruction of the header.
+ if (!isCompatibleIVType(Phi, IVSrc))
+ continue;
+ Instruction *PostIncV = dyn_cast<Instruction>(
+ Phi->getIncomingValueForBlock(L->getLoopLatch()));
+ if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+ continue;
+ Value *IVOper = IVSrc;
+ Type *PostIncTy = PostIncV->getType();
+ if (IVTy != PostIncTy) {
+ assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+ IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+ Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+ IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+ }
+ Phi->replaceUsesOfWith(PostIncV, IVOper);
+ DeadInsts.emplace_back(PostIncV);
+ }
+ }
+}
+
+void LSRInstance::CollectFixupsAndInitialFormulae() { // Creates an LSRFixup for every IVUsers entry not claimed by an IV chain and seeds each newly created LSRUse with an initial formula.
+ for (const IVStrideUse &U : IU) {
+ Instruction *UserInst = U.getUser();
+ // Skip IV users that are part of profitable IV Chains.
+ User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
+ U.getOperandValToReplace());
+ assert(UseI != UserInst->op_end() && "cannot find IV operand");
+ if (IVIncSet.count(UseI))
+ continue;
+
+ // Record the uses.
+ LSRFixup &LF = getNewFixup();
+ LF.UserInst = UserInst;
+ LF.OperandValToReplace = U.getOperandValToReplace();
+ LF.PostIncLoops = U.getPostIncLoops();
+
+ LSRUse::KindType Kind = LSRUse::Basic; // Default kind; refined to Address or ICmpZero below.
+ MemAccessTy AccessTy;
+ if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
+ Kind = LSRUse::Address;
+ AccessTy = getAccessType(LF.UserInst);
+ }
+
+ const SCEV *S = IU.getExpr(U);
+
+ // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+ // (N - i == 0), and this allows (N - i) to be the expression that we work
+ // with rather than just N or i, so we can consider the register
+ // requirements for both N and i at the same time. Limiting this code to
+ // equality icmps is not a problem because all interesting loops use
+ // equality icmps, thanks to IndVarSimplify.
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
+ if (CI->isEquality()) {
+ // Swap the operands if needed to put the OperandValToReplace on the
+ // left, for consistency.
+ Value *NV = CI->getOperand(1);
+ if (NV == LF.OperandValToReplace) {
+ CI->setOperand(1, CI->getOperand(0));
+ CI->setOperand(0, NV);
+ NV = CI->getOperand(1);
+ Changed = true; // The IR was modified (operands swapped).
+ }
+
+ // x == y --> x - y == 0
+ const SCEV *N = SE.getSCEV(NV);
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
+ // S is normalized, so normalize N before folding it into S
+ // to keep the result normalized.
+ N = TransformForPostIncUse(Normalize, N, CI, nullptr,
+ LF.PostIncLoops, SE, DT);
+ Kind = LSRUse::ICmpZero;
+ S = SE.getMinusSCEV(N, S);
+ }
+
+ // -1 and the negations of all interesting strides (except the negation
+ // of -1) are now also interesting.
+ for (size_t i = 0, e = Factors.size(); i != e; ++i) // e is captured up front, so the loop bound ignores entries inserted by the body.
+ if (Factors[i] != -1)
+ Factors.insert(-(uint64_t)Factors[i]);
+ Factors.insert(-1);
+ }
+
+ // Set up the initial formula for this use.
+ std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+ LF.LUIdx = P.first;
+ LF.Offset = P.second;
+ LSRUse &LU = Uses[LF.LUIdx];
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+ // If this is the first use of this LSRUse, give it a formula.
+ if (LU.Formulae.empty()) {
+ InsertInitialFormula(S, LU, LF.LUIdx);
+ CountRegisters(LU.Formulae.back(), LF.LUIdx); // NOTE(review): InsertFormula (reached via InsertInitialFormula) already calls CountRegisters on success, so this looks redundant - confirm.
+ }
+ }
+
+ DEBUG(print_fixups(dbgs()));
+}
+
+/// Give the use \p LU (stored at index \p LUIdx) its first formula, built by
+/// letting Formula::initialMatch decompose \p S with respect to the current
+/// loop. The use is also flagged as rigid when \p S is not safe to expand.
+/// Asserts that the formula was not already present.
+void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
+                                       size_t LUIdx) {
+  // Flag uses whose expression cannot be expanded.
+  const bool Expandable = isSafeToExpand(S, SE);
+  if (!Expandable)
+    LU.RigidFormula = true;
+
+  Formula Initial;
+  Initial.initialMatch(S, L, SE);
+
+  const bool WasNew = InsertFormula(LU, LUIdx, Initial);
+  (void)WasNew; // Only read by the assert below.
+  assert(WasNew && "Initial formula already exists!");
+}
+
+/// Add to the use \p LU (stored at index \p LUIdx) a formula whose only
+/// content is \p S as a single base register. Asserts that the formula was
+/// not already present.
+void LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU,
+                                            size_t LUIdx) {
+  Formula Single;
+  Single.HasBaseReg = true;
+  Single.BaseRegs.push_back(S);
+
+  const bool WasNew = InsertFormula(LU, LUIdx, Single);
+  (void)WasNew; // Only read by the assert below.
+  assert(WasNew && "Supplemental formula already exists!");
+}
+
+/// Record in RegUses that every register referenced by \p F (the scaled
+/// register, if any, followed by each base register) is used by the use at
+/// index \p LUIdx.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+  // The scaled register, when present, is recorded ahead of the base ones.
+  if (const SCEV *Scaled = F.ScaledReg)
+    RegUses.countRegister(Scaled, LUIdx);
+
+  for (size_t RegIdx = 0, NumRegs = F.BaseRegs.size(); RegIdx != NumRegs;
+       ++RegIdx)
+    RegUses.countRegister(F.BaseRegs[RegIdx], LUIdx);
+}
+
+/// Try to add \p F to the use \p LU (stored at index \p LUIdx).
+///
+/// \returns true if LU accepted the formula, in which case its registers are
+/// also counted for \p LUIdx; false if LU rejected it, in which case nothing
+/// is counted.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+  // A formula that cannot be expanded must never be inserted.
+  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
+         "Formula is illegal");
+
+  const bool IsNew = LU.InsertFormula(F);
+  if (IsNew)
+    CountRegisters(F, LUIdx);
+  return IsNew;
+}
+
+/// Check for other uses of loop-invariant values which we're tracking. These
+/// other uses will pin these values in registers, making them less profitable
+/// for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+ SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); // Seeded with every register currently tracked in RegUses.
+ SmallPtrSet<const SCEV *, 32> Visited;
+
+ while (!Worklist.empty()) {
+ const SCEV *S = Worklist.pop_back_val();
+
+ // Don't process the same SCEV twice
+ if (!Visited.insert(S).second)
+ continue;
+
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) // Compound expressions are decomposed; only SCEVUnknown leaves are examined below.
+ Worklist.append(N->op_begin(), N->op_end());
+ else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+ Worklist.push_back(C->getOperand());
+ else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ Worklist.push_back(D->getLHS());
+ Worklist.push_back(D->getRHS());
+ } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+ const Value *V = US->getValue();
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Look for instructions defined outside the loop.
+ if (L->contains(Inst)) continue;
+ } else if (isa<UndefValue>(V))
+ // Undef doesn't have a live range, so it doesn't matter.
+ continue;
+ for (const Use &U : V->uses()) {
+ const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+ // Ignore non-instructions.
+ if (!UserInst)
+ continue;
+ // Ignore instructions in other functions (as can happen with
+ // Constants).
+ if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+ continue;
+ // Ignore instructions not dominated by the loop. For a PHI user, the block that matters is the incoming block of this operand.
+ const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+ UserInst->getParent() :
+ cast<PHINode>(UserInst)->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ if (!DT.dominates(L->getHeader(), UseBB))
+ continue;
+ // Don't bother if the instruction is in a BB which ends in an EHPad.
+ if (UseBB->getTerminator()->isEHPad())
+ continue;
+ // Ignore uses which are part of other SCEV expressions, to avoid
+ // analyzing them multiple times.
+ if (SE.isSCEVable(UserInst->getType())) {
+ const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+ // If the user is a no-op, look through to its uses.
+ if (!isa<SCEVUnknown>(UserS))
+ continue;
+ if (UserS == US) {
+ Worklist.push_back(
+ SE.getUnknown(const_cast<Instruction *>(UserInst)));
+ continue;
+ }
+ }
+ // Ignore icmp instructions which are already being analyzed.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+ unsigned OtherIdx = !U.getOperandNo(); // An icmp has operands 0 and 1; this selects the one that is not U.
+ Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+ continue;
+ }
+
+ LSRFixup &LF = getNewFixup();
+ LF.UserInst = const_cast<Instruction *>(UserInst);
+ LF.OperandValToReplace = U;
+ std::pair<size_t, int64_t> P = getUse(
+ S, LSRUse::Basic, MemAccessTy());
+ LF.LUIdx = P.first;
+ LF.Offset = P.second;
+ LSRUse &LU = Uses[LF.LUIdx];
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+ InsertSupplementalFormula(US, LU, LF.LUIdx);
+ CountRegisters(LU.Formulae.back(), Uses.size() - 1); // NOTE(review): uses Uses.size() - 1 rather than LF.LUIdx, and InsertFormula already counts registers on success - confirm the two indices always agree here.
+ break; // Only the first qualifying use of V is recorded.
+ }
+ }
+ }
+}
+
+/// Decompose \p S into pieces that can each live in a separate register,
+/// appending them to \p Ops. When \p C is non-null, each appended piece is
+/// multiplied by \p C.
+///
+/// \param Depth current recursion depth; decomposition stops at depth 3.
+/// \returns the part of \p S that was not captured in \p Ops, or nullptr when
+/// \p Ops accounts for all of it.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+                                   SmallVectorImpl<const SCEV *> &Ops,
+                                   const Loop *L,
+                                   ScalarEvolution &SE,
+                                   unsigned Depth = 0) {
+  // Arbitrary recursion limit to protect compile time.
+  if (Depth >= 3)
+    return S;
+
+  // Sum: every operand is decomposed on its own, so nothing is left over.
+  if (const auto *Sum = dyn_cast<SCEVAddExpr>(S)) {
+    for (const SCEV *Term : Sum->operands()) {
+      const SCEV *Rest = CollectSubexprs(Term, C, Ops, L, SE, Depth + 1);
+      if (!Rest)
+        continue;
+      Ops.push_back(C ? SE.getMulExpr(C, Rest) : Rest);
+    }
+    return nullptr;
+  }
+
+  // Recurrence: peel a non-zero start value off the addrec.
+  if (const auto *Rec = dyn_cast<SCEVAddRecExpr>(S)) {
+    const SCEV *Start = Rec->getStart();
+    if (Start->isZero())
+      return S;
+
+    const SCEV *Rest = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
+    // The leftover start is split out too, unless this addrec belongs to a
+    // different loop and the leftover is itself a recurrence (a nested
+    // recurrence that does not pertain to this loop).
+    if (Rest && (Rec->getLoop() == L || !isa<SCEVAddRecExpr>(Rest))) {
+      Ops.push_back(C ? SE.getMulExpr(C, Rest) : Rest);
+      Rest = nullptr;
+    }
+
+    // Nothing was peeled off: hand the expression back untouched.
+    if (Rest == Start)
+      return S;
+
+    if (!Rest)
+      Rest = SE.getConstant(Rec->getType(), 0);
+    return SE.getAddRecExpr(Rest,
+                            Rec->getStepRecurrence(SE),
+                            Rec->getLoop(),
+                            //FIXME: Rec->getNoWrapFlags(SCEV::FlagNW)
+                            SCEV::FlagAnyWrap);
+  }
+
+  // Product: distribute (K * (a + b + c)) into K*a + K*b + K*c.
+  if (const auto *Prod = dyn_cast<SCEVMulExpr>(S)) {
+    if (Prod->getNumOperands() != 2)
+      return S;
+    const auto *Scale = dyn_cast<SCEVConstant>(Prod->getOperand(0));
+    if (!Scale)
+      return S;
+
+    C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Scale)) : Scale;
+    const SCEV *Rest =
+        CollectSubexprs(Prod->getOperand(1), C, Ops, L, SE, Depth + 1);
+    if (Rest)
+      Ops.push_back(SE.getMulExpr(C, Rest));
+    return nullptr;
+  }
+
+  return S;
+}
+
+/// \brief Helper function for LSRInstance::GenerateReassociations. Splits one register of \p Base (BaseRegs[Idx], or ScaledReg when \p IsScaledReg) into addends via CollectSubexprs and, for each eligible addend, tries to insert a formula in which that addend is pulled out on its own.
+void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ unsigned Depth, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ SmallVector<const SCEV *, 8> AddOps;
+ const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
+ if (Remainder)
+ AddOps.push_back(Remainder);
+
+ if (AddOps.size() == 1) // A single piece leaves nothing to reassociate.
+ return;
+
+ for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+ JE = AddOps.end();
+ J != JE; ++J) {
+
+ // Loop-variant "unknown" values are uninteresting; we won't be able to
+ // do anything meaningful with them.
+ if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+ continue;
+
+ // Don't pull a constant into a register if the constant could be folded
+ // into an immediate field.
+ if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, *J, Base.getNumRegs() > 1))
+ continue;
+
+ // Collect all operands except *J.
+ SmallVector<const SCEV *, 8> InnerAddOps(
+ ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+ InnerAddOps.append(std::next(J),
+ ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+ // Don't leave just a constant behind in a register if the constant could
+ // be folded into an immediate field.
+ if (InnerAddOps.size() == 1 &&
+ isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+ continue;
+
+ const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+ if (InnerSum->isZero())
+ continue;
+ Formula F = Base;
+
+ // Add the remaining pieces of the add back into the new formula.
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+ if (IsScaledReg) // The constant went into UnfoldedOffset, so the register that held it is dropped.
+ F.ScaledReg = nullptr;
+ else
+ F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
+ } else if (IsScaledReg)
+ F.ScaledReg = InnerSum;
+ else
+ F.BaseRegs[Idx] = InnerSum;
+
+ // Add J as its own register, or an unfolded immediate.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+ if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+ else
+ F.BaseRegs.push_back(*J);
+ // We may have changed the number of register in base regs, adjust the
+ // formula accordingly.
+ F.canonicalize();
+
+ if (InsertFormula(LU, LUIdx, F))
+ // If that formula hadn't been seen before, recurse to find more like
+ // it.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+ }
+}
+
+/// Generate new formulae for \p LU by reassociating each register of \p Base
+/// in turn: every base register, and additionally the scaled register when
+/// the scale is exactly 1.
+///
+/// \param Base must be canonical (asserted).
+/// \param Depth current recursion depth; nothing is generated at depth 3 or
+/// beyond.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+                                         Formula Base, unsigned Depth) {
+  assert(Base.isCanonical() && "Input must be in the canonical form");
+
+  // Arbitrary recursion limit to protect compile time.
+  if (Depth >= 3)
+    return;
+
+  const size_t NumBaseRegs = Base.BaseRegs.size();
+  for (size_t RegIdx = 0; RegIdx < NumBaseRegs; ++RegIdx)
+    GenerateReassociationsImpl(LU, LUIdx, Base, Depth, RegIdx);
+
+  if (Base.Scale != 1)
+    return;
+  GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
+                             /* Idx */ -1, /* IsScaledReg */ true);
+}
+
+/// Try to add to \p LU a variant of \p Base in which every base register that
+/// properly dominates the loop header and has no computable evolution in the
+/// loop is folded into one summed register. Nothing is inserted when fewer
+/// than two such registers exist or when their sum is zero.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+                                       Formula Base) {
+  // Combining only makes sense when more than one register is involved; a
+  // scale of 1 counts as one more register.
+  const size_t NumRegs = Base.BaseRegs.size() + (Base.Scale == 1);
+  if (NumRegs <= 1)
+    return;
+
+  // Flatten the representation first, i.e. reg1 + 1*reg2 => reg1 + reg2.
+  Base.unscale();
+
+  Formula Combined = Base;
+  Combined.BaseRegs.clear();
+
+  // Partition the registers: candidates for the sum go to Summands, all
+  // others stay in the new formula unchanged.
+  SmallVector<const SCEV *, 4> Summands;
+  for (const SCEV *Reg : Base.BaseRegs) {
+    const bool CanCombine = SE.properlyDominates(Reg, L->getHeader()) &&
+                            !SE.hasComputableLoopEvolution(Reg, L);
+    if (CanCombine)
+      Summands.push_back(Reg);
+    else
+      Combined.BaseRegs.push_back(Reg);
+  }
+
+  if (Summands.size() <= 1)
+    return;
+
+  const SCEV *Sum = SE.getAddExpr(Summands);
+  // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+  // opportunity to fold something. For now, just ignore such cases
+  // rather than proceed with zero in a register.
+  if (Sum->isZero())
+    return;
+
+  Combined.BaseRegs.push_back(Sum);
+  Combined.canonicalize();
+  (void)InsertFormula(LU, LUIdx, Combined);
+}
+
+/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+///
+/// Works on one register of \p Base: BaseRegs[Idx], or ScaledReg when
+/// \p IsScaledReg is set. If ExtractSymbol yields a global for that register,
+/// tries to insert a formula that carries the global in BaseGV and the value
+/// left in the local register variable after the ExtractSymbol call.
+void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+                                              const Formula &Base, size_t Idx,
+                                              bool IsScaledReg) {
+  // NOTE(review): ExtractSymbol presumably rewrites its first argument to the
+  // expression without the symbol; its definition is not visible here.
+  const SCEV *Remainder = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+  GlobalValue *Symbol = ExtractSymbol(Remainder, SE);
+  if (!Symbol || Remainder->isZero())
+    return;
+
+  Formula WithSymbol = Base;
+  WithSymbol.BaseGV = Symbol;
+  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                  WithSymbol))
+    return;
+
+  // Store the remaining expression back into the slot it came from.
+  const SCEV *&Slot =
+      IsScaledReg ? WithSymbol.ScaledReg : WithSymbol.BaseRegs[Idx];
+  Slot = Remainder;
+  (void)InsertFormula(LU, LUIdx, WithSymbol);
+}
+
+/// Generate reuse formulae for \p LU that use a symbolic (global) offset, by
+/// running GenerateSymbolicOffsetsImpl on every base register of \p Base and,
+/// when the scale is exactly 1, on the scaled register as well.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+                                          Formula Base) {
+  // A formula that already carries a symbol cannot take another one.
+  if (Base.BaseGV)
+    return;
+
+  const size_t NumBaseRegs = Base.BaseRegs.size();
+  for (size_t RegIdx = 0; RegIdx < NumBaseRegs; ++RegIdx)
+    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, RegIdx);
+
+  if (Base.Scale != 1)
+    return;
+  GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
+                              /* IsScaledReg */ true);
+}
+
+/// \brief Helper function for LSRInstance::GenerateConstantOffsets. For one register of \p Base (BaseRegs[Idx], or ScaledReg when \p IsScaledReg), tries each offset in \p Worklist and then the immediate extracted from the register itself.
+void LSRInstance::GenerateConstantOffsetsImpl(
+ LSRUse &LU, unsigned LUIdx, const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ for (int64_t Offset : Worklist) {
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; // Computed in uint64_t, where wraparound is well defined.
+ if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
+ LU.AccessTy, F)) {
+ // Add the offset to the base register.
+ const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+ // If it cancelled out, drop the base register, otherwise update it.
+ if (NewG->isZero()) {
+ if (IsScaledReg) {
+ F.Scale = 0;
+ F.ScaledReg = nullptr;
+ } else
+ F.deleteBaseReg(F.BaseRegs[Idx]);
+ F.canonicalize();
+ } else if (IsScaledReg)
+ F.ScaledReg = NewG;
+ else
+ F.BaseRegs[Idx] = NewG;
+
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+ // Separately, try moving the immediate extracted from the register into BaseOffset.
+ int64_t Imm = ExtractImmediate(G, SE);
+ if (G->isZero() || Imm == 0)
+ return;
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg) // Store G, as left by the ExtractImmediate call, back into the slot it came from.
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae for \p LU using constant offsets, by running
+/// GenerateConstantOffsetsImpl on every base register of \p Base and, when
+/// the scale is exactly 1, on the scaled register as well. The offsets tried
+/// are the use's MinOffset and, if different, its MaxOffset.
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+                                          Formula Base) {
+  // TODO: For now, just add the min and max offset, because it usually isn't
+  // worthwhile looking at everything inbetween.
+  SmallVector<int64_t, 2> Offsets;
+  Offsets.push_back(LU.MinOffset);
+  if (LU.MinOffset != LU.MaxOffset)
+    Offsets.push_back(LU.MaxOffset);
+
+  const size_t NumBaseRegs = Base.BaseRegs.size();
+  for (size_t RegIdx = 0; RegIdx < NumBaseRegs; ++RegIdx)
+    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Offsets, RegIdx);
+
+  if (Base.Scale != 1)
+    return;
+  GenerateConstantOffsetsImpl(LU, LUIdx, Base, Offsets, /* Idx */ -1,
+                              /* IsScaledReg */ true);
+}
+
+/// For ICmpZero, check to see if we can scale up the comparison. For example, x
+/// == y -> x*c == y*c. Each entry of Factors is tried as c; a factor is skipped as soon as any scaled component fails one of the checks below.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ if (LU.Kind != LSRUse::ICmpZero) return;
+
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+ if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+ // Don't do this if there is more than one offset.
+ if (LU.MinOffset != LU.MaxOffset) return;
+
+ assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ // Check that the multiplication doesn't overflow.
+ if (Base.BaseOffset == INT64_MIN && Factor == -1)
+ continue;
+ int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; // Multiplied as unsigned; the division on the next line detects overflow.
+ if (NewBaseOffset / Factor != Base.BaseOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+ continue;
+
+ // Check that multiplying with the use offset doesn't overflow.
+ int64_t Offset = LU.MinOffset;
+ if (Offset == INT64_MIN && Factor == -1)
+ continue;
+ Offset = (uint64_t)Offset * Factor;
+ if (Offset / Factor != LU.MinOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, Offset))
+ continue;
+
+ Formula F = Base;
+ F.BaseOffset = NewBaseOffset;
+
+ // Check that this scale is legal.
+ if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+ continue;
+
+ // Compensate for the use having MinOffset built into it.
+ F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+ // Check that multiplying with each base register doesn't overflow.
+ for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+ F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+ if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+ goto next; // Jumps to the end of the outer Factor loop body; a plain continue here would only affect this inner loop.
+ }
+
+ // Check that multiplying with the scaled register doesn't overflow.
+ if (F.ScaledReg) {
+ F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+ if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+ continue;
+ }
+
+ // Check that multiplying with the unfolded offset doesn't overflow.
+ if (F.UnfoldedOffset != 0) {
+ if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+ continue;
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+ if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ continue;
+ // If the offset will be truncated, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+ continue;
+ }
+
+ // If we make it here and it's legal, add it.
+ (void)InsertFormula(LU, LUIdx, F);
+ next:; // Target of the goto in the base-register loop above.
+ }
+}
+
+/// Generate stride factor reuse formulae by making use of scaled-offset address
+/// modes, for example. Each entry of Factors is tried as the scale; for every addrec base register exactly divisible by it, a formula using the quotient as the scaled register is offered to InsertFormula.
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+
+ // If this Formula already has a scaled register, we can't add another one.
+ // Try to unscale the formula to generate a better scale.
+ if (Base.Scale != 0 && !Base.unscale())
+ return;
+
+ assert(Base.Scale == 0 && "unscale did not did its job!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ Base.Scale = Factor; // Base is this function's own by-value copy, so it is updated in place for each factor.
+ Base.HasBaseReg = Base.BaseRegs.size() > 1;
+ // Check whether this scale is going to be legal.
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ Base)) {
+ // As a special-case, handle special out-of-loop Basic users specially.
+ // TODO: Reconsider this special case.
+ if (LU.Kind == LSRUse::Basic &&
+ isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+ LU.AccessTy, Base) &&
+ LU.AllFixupsOutsideLoop)
+ LU.Kind = LSRUse::Special; // LU is a reference, so this changes the kind of the use itself, not just of this formula.
+ else
+ continue;
+ }
+ // For an ICmpZero, negating a solitary base register won't lead to
+ // new solutions.
+ if (LU.Kind == LSRUse::ICmpZero &&
+ !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+ continue;
+ // For each addrec base reg, apply the scale, if possible.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ if (const SCEVAddRecExpr *AR =
+ dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+ if (FactorS->isZero())
+ continue; // Continues the inner base-register loop.
+ // Divide out the factor, ignoring high bits, since we'll be
+ // scaling the value back up in the end.
+ if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+ // TODO: This could be optimized to avoid all the copying.
+ Formula F = Base;
+ F.ScaledReg = Quotient;
+ F.deleteBaseReg(F.BaseRegs[i]);
+ // The canonical representation of 1*reg is reg, which is already in
+ // Base. In that case, do not try to insert the formula, it will be
+ // rejected anyway.
+ if (F.Scale == 1 && F.BaseRegs.empty())
+ continue;
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+ }
+}
+
+/// Generate reuse formulae for \p LU from other IV types: for every type in
+/// Types that differs from the effective type of \p Base and that
+/// TTI.isTruncateFree accepts, any-extend all registers of \p Base to that
+/// type and offer the result to InsertFormula, provided at least one of its
+/// registers is used by some other use.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+  // Don't bother truncating symbolic values.
+  if (Base.BaseGV)
+    return;
+
+  // Determine the integer type for the base formula.
+  Type *NarrowTy = Base.getType();
+  if (!NarrowTy)
+    return;
+  NarrowTy = SE.getEffectiveSCEVType(NarrowTy);
+
+  for (Type *WideTy : Types) {
+    if (WideTy == NarrowTy)
+      continue;
+    if (!TTI.isTruncateFree(WideTy, NarrowTy))
+      continue;
+
+    Formula Extended = Base;
+    if (Extended.ScaledReg)
+      Extended.ScaledReg = SE.getAnyExtendExpr(Extended.ScaledReg, WideTy);
+    for (size_t RegIdx = 0, NumRegs = Extended.BaseRegs.size();
+         RegIdx != NumRegs; ++RegIdx)
+      Extended.BaseRegs[RegIdx] =
+          SE.getAnyExtendExpr(Extended.BaseRegs[RegIdx], WideTy);
+
+    // TODO: This assumes we've done basic processing on all uses and
+    // have an idea what the register usage is.
+    if (Extended.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+      (void)InsertFormula(LU, LUIdx, Extended);
+  }
+}
+
+namespace {
+
+/// A deferred modification recorded by GenerateCrossUseConstantOffsets: for
+/// the use at index LUIdx, formulae referencing OrigReg are to be revisited
+/// with the offset Imm. Collecting these first lets the search phase run
+/// without the data structures changing underneath it.
+struct WorkItem {
+  size_t LUIdx;        ///< Index of the use to update.
+  int64_t Imm;         ///< Offset to add.
+  const SCEV *OrigReg; ///< Register the affected formulae reference.
+
+  WorkItem(size_t UseIdx, int64_t Offset, const SCEV *Reg)
+      : LUIdx(UseIdx), Imm(Offset), OrigReg(Reg) {}
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+} // end anonymous namespace
+
+/// Write a one-line, human-readable description of this work item to \p OS.
+void WorkItem::print(raw_ostream &OS) const {
+  OS << "in formulae referencing ";
+  OS << *OrigReg;
+  OS << " in use " << LUIdx;
+  OS << " , add offset " << Imm;
+}
+
+/// Print this work item to errs(), followed by a newline.
+LLVM_DUMP_METHOD
+void WorkItem::dump() const {
+  raw_ostream &Err = errs();
+  print(Err);
+  Err << '\n';
+}
+
+/// Look for registers which are a constant distance apart and try to form reuse
+/// opportunities between them.
+///
+/// The work is split in two phases. The search phase groups the registers in
+/// RegUses by their value with the constant offset stripped and records a
+/// WorkItem for each (use, offset) pair worth trying. The update phase then
+/// walks the work list and inserts the new formulae, so no formulae or register
+/// counts are added while the search is still running.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+  // Group the registers by their value without any added constant offset.
+  typedef std::map<int64_t, const SCEV *> ImmMapTy;
+  DenseMap<const SCEV *, ImmMapTy> Map;
+  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+  // Base registers in the order they were first encountered.
+  SmallVector<const SCEV *, 8> Sequence;
+  for (const SCEV *Use : RegUses) {
+    const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
+    int64_t Imm = ExtractImmediate(Reg, SE);
+    auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
+    if (Pair.second)
+      Sequence.push_back(Reg);
+    Pair.first->second.insert(std::make_pair(Imm, Use));
+    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
+  }
+
+  // Now examine each set of registers with the same base value. Build up
+  // a list of work to do and do the work in a separate step so that we're
+  // not adding formulae and register counts while we're searching.
+  SmallVector<WorkItem, 32> WorkItems;
+  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+  for (const SCEV *Reg : Sequence) {
+    const ImmMapTy &Imms = Map.find(Reg)->second;
+
+    // It's not worthwhile looking for reuse if there's only one offset.
+    if (Imms.size() == 1)
+      continue;
+
+    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+          for (const auto &Entry : Imms)
+            dbgs() << ' ' << Entry.first;
+          dbgs() << '\n');
+
+    // Conservatively examine offsets between each orig reg and a few selected
+    // other orig regs: the smallest offset, the largest offset, and the first
+    // offset not below their midpoint. The selection depends only on Imms, so
+    // it is computed once per base register.
+    int64_t First = Imms.begin()->first;
+    int64_t Last = std::prev(Imms.end())->first;
+    // Compute (First + Last) / 2 without signed overflow, using the identity
+    // First + Last == 2 * (First & Last) + (First ^ Last).
+    int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
+    // The expression above rounds towards -inf. If the result is negative and
+    // the sum is odd, add 1 so that it rounds towards zero like the division
+    // it replaces.
+    Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
+    const ImmMapTy::const_iterator OtherImms[] = {
+      Imms.begin(), std::prev(Imms.end()), Imms.lower_bound(Avg)
+    };
+
+    // Examine each offset.
+    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+         J != JE; ++J) {
+      const SCEV *OrigReg = J->second;
+
+      int64_t JImm = J->first;
+      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+      if (!isa<SCEVConstant>(OrigReg) &&
+          UsedByIndicesMap[Reg].count() == 1) {
+        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+        continue;
+      }
+
+      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+        ImmMapTy::const_iterator M = OtherImms[i];
+        // Skip the register itself and a lower_bound() that found nothing.
+        if (M == J || M == JE) continue;
+
+        // Compute the difference between the two. The subtraction is done in
+        // unsigned arithmetic so that it wraps instead of overflowing.
+        int64_t Imm = (uint64_t)JImm - M->first;
+        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
+             LUIdx = UsedByIndices.find_next(LUIdx))
+          // Make a memo of this use, offset, and register tuple.
+          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
+            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+      }
+    }
+  }
+
+  // The search structures are no longer needed.
+  Map.clear();
+  Sequence.clear();
+  UsedByIndicesMap.clear();
+  UniqueItems.clear();
+
+  // Now iterate through the worklist and add new formulae.
+  for (const WorkItem &WI : WorkItems) {
+    size_t LUIdx = WI.LUIdx;
+    LSRUse &LU = Uses[LUIdx];
+    int64_t Imm = WI.Imm;
+    const SCEV *OrigReg = WI.OrigReg;
+
+    Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+    // Only the formulae present on entry (the first LE) are visited; each one
+    // is copied before being modified.
+    // TODO: Use a more targeted data structure.
+    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+      Formula F = LU.Formulae[L];
+      // FIXME: The code for the scaled and unscaled registers looks
+      // very similar but slightly different. Investigate if they
+      // could be merged. That way, we would not have to unscale the
+      // Formula.
+      F.unscale();
+      // Use the immediate in the scaled register.
+      if (F.ScaledReg == OrigReg) {
+        int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+        // Don't create 50 + reg(-50).
+        if (F.referencesReg(SE.getSCEV(
+                ConstantInt::get(IntTy, -(uint64_t)Offset))))
+          continue;
+        Formula NewF = F;
+        NewF.BaseOffset = Offset;
+        if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                        NewF))
+          continue;
+        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+        // If the new scale is a constant in a register, and adding the constant
+        // value to the immediate would produce a value closer to zero than the
+        // immediate itself, then the formula isn't worthwhile.
+        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+          if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+              (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
+                  .ule(std::abs(NewF.BaseOffset)))
+            continue;
+
+        // OK, looks good.
+        NewF.canonicalize();
+        (void)InsertFormula(LU, LUIdx, NewF);
+      } else {
+        // Use the immediate in a base register.
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEV *BaseReg = F.BaseRegs[N];
+          if (BaseReg != OrigReg)
+            continue;
+          Formula NewF = F;
+          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            // The offset cannot be folded into the use; fall back to carrying
+            // it as an unfolded offset if the target can add it directly.
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+          // If the new formula has a constant in a register, and adding the
+          // constant value to the immediate would produce a value closer to
+          // zero than the immediate itself, then the formula isn't worthwhile.
+          for (const SCEV *NewReg : NewF.BaseRegs)
+            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+              if ((C->getAPInt() + NewF.BaseOffset)
+                      .abs()
+                      .slt(std::abs(NewF.BaseOffset)) &&
+                  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
+                      countTrailingZeros<uint64_t>(NewF.BaseOffset))
+                goto skip_formula;
+
+          // Ok, looks good.
+          NewF.canonicalize();
+          (void)InsertFormula(LU, LUIdx, NewF);
+          break;
+        skip_formula:;
+        }
+      }
+    }
+  }
+}
+
+/// Run every reuse-formula generator over every use, then generate the
+/// cross-use constant offsets. Each generator is applied only to the formulae
+/// its use holds when that generator's loop starts, because the loop bound is
+/// captured before the first call.
+void LSRInstance::GenerateAllReuseFormulae() {
+  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+  // queries are more precise.
+
+  // First pass: reassociations, then combinations.
+  for (size_t UseIdx = 0, UseEnd = Uses.size(); UseIdx != UseEnd; ++UseIdx) {
+    LSRUse &Use = Uses[UseIdx];
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateReassociations(Use, UseIdx, Use.Formulae[FIdx]);
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateCombinations(Use, UseIdx, Use.Formulae[FIdx]);
+  }
+
+  // Second pass: symbolic offsets, constant offsets, ICmpZero scales, scales.
+  for (size_t UseIdx = 0, UseEnd = Uses.size(); UseIdx != UseEnd; ++UseIdx) {
+    LSRUse &Use = Uses[UseIdx];
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateSymbolicOffsets(Use, UseIdx, Use.Formulae[FIdx]);
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateConstantOffsets(Use, UseIdx, Use.Formulae[FIdx]);
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateICmpZeroScales(Use, UseIdx, Use.Formulae[FIdx]);
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateScales(Use, UseIdx, Use.Formulae[FIdx]);
+  }
+
+  // Third pass: truncates.
+  for (size_t UseIdx = 0, UseEnd = Uses.size(); UseIdx != UseEnd; ++UseIdx) {
+    LSRUse &Use = Uses[UseIdx];
+    for (size_t FIdx = 0, FEnd = Use.Formulae.size(); FIdx != FEnd; ++FIdx)
+      GenerateTruncates(Use, UseIdx, Use.Formulae[FIdx]);
+  }
+
+  GenerateCrossUseConstantOffsets();
+
+  DEBUG(dbgs() << "\n"
+                  "After generating reuse formulae:\n";
+        print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others. Losers go too.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs; // Scratch set, cleared before each RateFormula call.
+ SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+ bool ChangedFormulae = false; // Only read by the debug dump at the end.
+#endif
+
+ // Collect the best formula for each unique set of shared registers. This
+ // is reset for each use.
+ typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
+ BestFormulaeTy;
+ BestFormulaeTy BestFormulae; // Maps a sorted shared-register key to a formula index.
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ bool Any = false; // Set once any formula of this use has been deleted.
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+ FIdx != NumForms; ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+
+ // Some formulas are instant losers. For example, they may depend on
+ // nonexistent AddRecs from other loops. These need to be filtered
+ // immediately, otherwise heuristics could choose them over others leading
+ // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+ // avoids the need to recompute this information across formulae using the
+ // same bad AddRec. Passing LoserRegs is also essential unless we remove
+ // the corresponding bad register from the Regs set.
+ Cost CostF;
+ Regs.clear();
+ CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
+ &LoserRegs);
+ if (CostF.isLoser()) {
+ // During initial formula generation, undesirable formulae are generated
+ // by uses within other loops that have some non-trivial address mode or
+ // use the postinc form of the IV. LSR needs to provide these formulae
+ // as the basis of rediscovering the desired formula that uses an AddRec
+ // corresponding to the existing phi. Once all formulae have been
+ // generated, these initial losers may be pruned.
+ DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
+ }
+ else {
+ SmallVector<const SCEV *, 4> Key; // Registers of F that other uses also use.
+ for (const SCEV *Reg : F.BaseRegs) {
+ if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+ Key.push_back(Reg);
+ }
+ if (F.ScaledReg &&
+ RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+ Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for
+ // uniquifying.
+ std::sort(Key.begin(), Key.end());
+
+ std::pair<BestFormulaeTy::const_iterator, bool> P =
+ BestFormulae.insert(std::make_pair(Key, FIdx));
+ if (P.second)
+ continue; // First formula seen with this key: keep it for now.
+
+ Formula &Best = LU.Formulae[P.first->second];
+
+ Cost CostBest;
+ Regs.clear();
+ CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
+ DT, LU);
+ if (CostF < CostBest)
+ std::swap(F, Best); // The cheaper formula now sits in Best's slot; F holds the one to delete.
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula "; Best.print(dbgs());
+ dbgs() << '\n');
+ }
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F); // Reached for losers and for formulae beaten by Best.
+ --FIdx; // Step back so the loop's ++FIdx examines this slot again.
+ --NumForms; // One fewer formula left to visit.
+ Any = true;
+ }
+
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
+// Search-space size cap; a rough guess that seems to work fairly well.
+static const size_t ComplexityLimit = UINT16_MAX; // 65535
+
+/// Estimate the worst-case number of solutions the solver might have to
+/// consider: the product of the formula counts of all uses. The solver almost
+/// never considers this many solutions because it prunes the search space, but
+/// the pruning isn't always sufficient.
+///
+/// The multiplication stops as soon as the running product reaches
+/// ComplexityLimit, so callers should only compare the result against that
+/// limit.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+  size_t Product = 1;
+  for (const LSRUse &LU : Uses) {
+    const size_t NumFormulae = LU.Formulae.size();
+    // A single use at or above the limit settles the answer on its own.
+    if (NumFormulae >= ComplexityLimit)
+      return ComplexityLimit;
+    Product *= NumFormulae;
+    if (Product >= ComplexityLimit)
+      return Product;
+  }
+  return Product;
+}
+
+/// When one formula uses a superset of the registers of another formula, it
+/// won't help reduce register pressure (though it may not necessarily hurt
+/// register pressure); remove it to simplify the system.
+///
+/// Concretely, a formula that keeps a constant or a global value in a base
+/// register is deleted when the same use already has a formula with the same
+/// registers once that value is moved into the immediate / BaseGV field. Does
+/// nothing unless the estimated search space is at or above ComplexityLimit.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  DEBUG(dbgs() << "The search space is too complex.\n");
+
+  DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+                  "which use a superset of registers used by other "
+                  "formulae.\n");
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    bool Any = false;
+    for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+      Formula &F = LU.Formulae[i];
+      // Look for a formula with a constant or GV in a register. If the use
+      // also has a formula with that same value in an immediate field,
+      // delete the one that uses a register.
+      for (SmallVectorImpl<const SCEV *>::const_iterator
+               I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+          Formula NewF = F;
+          // Add in unsigned arithmetic so the sum wraps instead of overflowing,
+          // matching how offsets are combined elsewhere in this file.
+          NewF.BaseOffset =
+              (uint64_t)NewF.BaseOffset + C->getValue()->getSExtValue();
+          NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                              (I - F.BaseRegs.begin()));
+          if (LU.HasFormulaWithSameRegs(NewF)) {
+            DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+            LU.DeleteFormula(F);
+            // Step back so the loop's ++i examines this slot again.
+            --i;
+            --e;
+            Any = true;
+            break;
+          }
+        } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+          if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+            if (!F.BaseGV) {
+              Formula NewF = F;
+              NewF.BaseGV = GV;
+              NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                  (I - F.BaseRegs.begin()));
+              if (LU.HasFormulaWithSameRegs(NewF)) {
+                DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+                      dbgs() << '\n');
+                LU.DeleteFormula(F);
+                // Step back so the loop's ++i examines this slot again.
+                --i;
+                --e;
+                Any = true;
+                break;
+              }
+            }
+        }
+      }
+    }
+    // The register set of this use is stale once formulae have been deleted.
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+  }
+
+  DEBUG(dbgs() << "After pre-selection:\n";
+        print_uses(dbgs()));
+}
+
+/// When there are many registers for expressions like A, A+1, A+2, etc.,
+/// allocate a single register for them by merging such uses into one use.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return; // The search space is already small enough.
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
+
+ // This is especially useful for unrolled loops.
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (const Formula &F : LU.Formulae) {
+ if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) // Need a nonzero offset and a scale of 0 or 1.
+ continue;
+
+ LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+ if (!LUThatHas)
+ continue;
+
+ if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+ LU.Kind, LU.AccessTy))
+ continue;
+
+ DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+ LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+ // Update the relocs to reference the new use.
+ for (LSRFixup &Fixup : Fixups) {
+ if (Fixup.LUIdx == LUIdx) {
+ Fixup.LUIdx = LUThatHas - &Uses.front(); // Index of LUThatHas within Uses.
+ Fixup.Offset += F.BaseOffset;
+ // Add the new offset to LUThatHas' offset list.
+ if (LUThatHas->Offsets.back() != Fixup.Offset) {
+ LUThatHas->Offsets.push_back(Fixup.Offset);
+ if (Fixup.Offset > LUThatHas->MaxOffset)
+ LUThatHas->MaxOffset = Fixup.Offset;
+ if (Fixup.Offset < LUThatHas->MinOffset)
+ LUThatHas->MinOffset = Fixup.Offset;
+ }
+ DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ }
+ if (Fixup.LUIdx == NumUses-1) // NOTE(review): presumably DeleteUse moves the last use into slot LUIdx; confirm.
+ Fixup.LUIdx = LUIdx;
+ }
+
+ // Delete formulae from the new use which are no longer legal.
+ bool Any = false;
+ for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+ Formula &F = LUThatHas->Formulae[i]; // Note: shadows the outer loop variable F.
+ if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+ LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LUThatHas->DeleteFormula(F);
+ --i; // Step back so the loop's ++i examines this slot again.
+ --e;
+ Any = true;
+ }
+ }
+
+ if (Any)
+ LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+ // Delete the old use.
+ DeleteUse(LU, LUIdx);
+ --LUIdx; // Step back so the outer loop's ++LUIdx examines this slot again.
+ --NumUses;
+ break; // LU has been deleted; stop scanning its formulae.
+ }
+ }
+
+ DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Run FilterOutUndesirableDedicatedRegisters a second time when the search
+/// space is still too large. Earlier narrowing steps have removed formulae, so
+/// the filter may now be able to find more formulae to eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+  // Nothing to do while the estimate stays under the limit.
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  DEBUG(dbgs() << "The search space is too complex.\n");
+
+  DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+                  "undesirable dedicated registers.\n");
+
+  FilterOutUndesirableDedicatedRegisters();
+
+  DEBUG(dbgs() << "After pre-selection:\n";
+        print_uses(dbgs()));
+}
+
+/// Pick a register which seems likely to be profitable, and then in any use
+/// which has any reference to that register, delete all formulae which do not
+/// reference that register.
+///
+/// Repeats, picking a different register each round, until the estimated
+/// search space drops below ComplexityLimit.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+  // With all other options exhausted, loop until the system is simple
+  // enough to handle.
+  SmallPtrSet<const SCEV *, 4> Taken; // Registers picked in earlier rounds.
+  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    // Ok, we have too many of formulae on our hands to conveniently handle.
+    // Use a rough heuristic to thin out the list.
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    // Pick the register which is used by the most LSRUses, which is likely
+    // to be a good reuse register candidate.
+    const SCEV *Best = nullptr;
+    unsigned BestNum = 0;
+    for (const SCEV *Reg : RegUses) {
+      if (Taken.count(Reg))
+        continue;
+      unsigned Count = RegUses.getUsedByIndices(Reg).count();
+      // The first candidate must record its own count as well; otherwise any
+      // later register with a non-zero count would displace it, even when the
+      // first candidate is the most widely used one.
+      if (!Best || Count > BestNum) {
+        Best = Reg;
+        BestNum = Count;
+      }
+    }
+
+    // Every round must pick a fresh register; Best is dereferenced below.
+    assert(Best && "Failed to find best LSRUse candidate");
+
+    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+                 << " will yield profitable reuse.\n");
+    Taken.insert(Best);
+
+    // In any use with formulae which references this register, delete formulae
+    // which don't reference it.
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      if (!LU.Regs.count(Best)) continue;
+
+      bool Any = false;
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        if (!F.referencesReg(Best)) {
+          DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+          LU.DeleteFormula(F);
+          // Step back so the loop's ++i examines this slot again.
+          --e;
+          --i;
+          Any = true;
+          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+        }
+      }
+
+      // The register set of this use is stale once formulae have been deleted.
+      if (Any)
+        LU.RecomputeRegs(LUIdx, RegUses);
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// Prune the set of candidate formulae with some rough heuristics when there
+/// are an extraordinary number of them to choose from. This keeps the main
+/// solver from taking an extraordinary amount of time in some worst-case
+/// scenarios. The steps run in a fixed order and each one checks the estimated
+/// search-space size itself before doing any work.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+  // 1. Drop formulae that use a superset of another formula's registers.
+  NarrowSearchSpaceByDetectingSupersets();
+  // 2. Merge uses that differ only by a constant offset.
+  NarrowSearchSpaceByCollapsingUnrolledCode();
+  // 3. Re-run the dedicated-register filter on what is left.
+  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  // 4. Commit to likely-profitable registers until the space is small enough.
+  NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// This is the recursive solver: it tries each formula of the next undecided use.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const {
+ // Some ideas:
+ // - prune more:
+ // - use more aggressive filtering
+ // - sort the formula so that the most profitable solutions are found first
+ // - sort the uses too
+ // - search faster:
+ // - don't compute a cost, and then compare. compare while computing a cost
+ // and bail early.
+ // - track register sets with SmallBitVector
+
+ const LSRUse &LU = Uses[Workspace.size()]; // Workspace holds one formula per use decided so far.
+
+ // If this use references any register that's already a part of the
+ // in-progress solution, consider it a requirement that a formula must
+ // reference that register in order to be considered. This prunes out
+ // unprofitable searching.
+ SmallSetVector<const SCEV *, 4> ReqRegs;
+ for (const SCEV *S : CurRegs)
+ if (LU.Regs.count(S))
+ ReqRegs.insert(S);
+
+ SmallPtrSet<const SCEV *, 16> NewRegs;
+ Cost NewCost;
+ for (const Formula &F : LU.Formulae) {
+ // Ignore formulae which may not be ideal in terms of register reuse of
+ // ReqRegs. The formula should use all required registers before
+ // introducing new ones.
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); // Cannot demand more matches than F has registers.
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
+ F.BaseRegs.end()) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
+ }
+
+ // Evaluate the cost of the current formula. If it's already worse than
+ // the current best, prune the search at that point.
+ NewCost = CurCost;
+ NewRegs = CurRegs;
+ NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
+ LU);
+ if (NewCost < SolutionCost) {
+ Workspace.push_back(&F); // Tentatively choose F for this use.
+ if (Workspace.size() != Uses.size()) {
+ SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+ NewRegs, VisitedRegs);
+ if (F.getNumRegs() == 1 && Workspace.size() == 1) // Only for the first use, with a single-register formula.
+ VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+ } else {
+ DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\n Regs:";
+ for (const SCEV *S : NewRegs)
+ dbgs() << ' ' << *S;
+ dbgs() << '\n');
+
+ SolutionCost = NewCost; // Every use has a formula and the cost beats the previous best.
+ Solution = Workspace;
+ }
+ Workspace.pop_back(); // Undo the tentative choice before trying the next formula.
+ }
+ }
+}
+
+/// Choose one formula from each use. Return the results in the given Solution
+/// vector.
+///
+/// SolveRecurse performs the search. If Solution is still empty afterwards, a
+/// debug message is emitted and the function returns without a solution.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+  SmallVector<const Formula *, 8> Workspace;
+  Cost SolutionCost;
+  // NOTE(review): Lose() presumably makes this the worst possible cost so that
+  // the first complete candidate compares as better; confirm in Cost.
+  SolutionCost.Lose();
+  Cost CurCost;
+  SmallPtrSet<const SCEV *, 16> CurRegs;
+  DenseSet<const SCEV *> VisitedRegs;
+  Workspace.reserve(Uses.size());
+
+  // SolveRecurse does all the work.
+  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+               CurRegs, VisitedRegs);
+  if (Solution.empty()) {
+    DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+    return;
+  }
+
+  // Check the invariant before the dump below indexes Solution once per use.
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
+
+  // Ok, we've now made all our decisions.
+  DEBUG(dbgs() << "\n"
+                  "The chosen solution requires "; SolutionCost.print(dbgs());
+        dbgs() << ":\n";
+        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+          dbgs() << " ";
+          Uses[i].print(dbgs());
+          dbgs() << "\n"
+                    " ";
+          Solution[i]->print(dbgs());
+          dbgs() << '\n';
+        });
+}
+
+/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
+/// as we can go while still being dominated by the input positions. This helps
+/// canonicalize the insert position, which encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs)
+ const {
+ for (;;) {
+ const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+ unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; // Depth 0 means "not in any loop".
+
+ BasicBlock *IDom;
+ for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+ if (!Rung) return IP; // No dominator-tree node for this block: cannot climb.
+ Rung = Rung->getIDom();
+ if (!Rung) return IP; // No immediate dominator: nothing further to climb to.
+ IDom = Rung->getBlock();
+
+ // Don't climb into a loop though.
+ const Loop *IDomLoop = LI.getLoopFor(IDom);
+ unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+ if (IDomDepth <= IPLoopDepth &&
+ (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+ break; // Acceptable: a shallower loop nest, or the very same loop.
+ }
+
+ bool AllDominate = true;
+ Instruction *BetterPos = nullptr;
+ Instruction *Tentative = IDom->getTerminator(); // Candidate position: end of the dominating block.
+ for (Instruction *Inst : Inputs) {
+ if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+ AllDominate = false;
+ break;
+ }
+ // Attempt to find an insert position in the middle of the block,
+ // instead of at the end, so that it can be used for other expansions.
+ if (IDom == Inst->getParent() &&
+ (!BetterPos || !DT.dominates(Inst, BetterPos)))
+ BetterPos = &*std::next(BasicBlock::iterator(Inst)); // The instruction right after Inst.
+ }
+ if (!AllDominate)
+ break; // Some input does not dominate the candidate; keep the current IP.
+ if (BetterPos)
+ IP = BetterPos->getIterator();
+ else
+ IP = Tentative->getIterator();
+ }
+
+ return IP;
+}
+
+/// Determine an input position which will be dominated by the operands and
+/// which will dominate the result.
+///
+/// Starting from \p LowestIP, the position is hoisted as far as the collected
+/// input instructions allow and then moved forward past PHI nodes, EH pads,
+/// debug intrinsics and instructions previously inserted by \p Rewriter.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+                                           const LSRFixup &LF,
+                                           const LSRUse &LU,
+                                           SCEVExpander &Rewriter) const {
+  // Gather the instructions the expanded replacement must be dominated by.
+  // These must be dominated by any operands required in the expansion.
+  SmallVector<Instruction *, 4> Inputs;
+
+  if (Instruction *OperandInst = dyn_cast<Instruction>(LF.OperandValToReplace))
+    Inputs.push_back(OperandInst);
+
+  // For an ICmpZero use the comparison's second operand is an input as well.
+  if (LU.Kind == LSRUse::ICmpZero) {
+    Value *CmpRHS = cast<ICmpInst>(LF.UserInst)->getOperand(1);
+    if (Instruction *RHSInst = dyn_cast<Instruction>(CmpRHS))
+      Inputs.push_back(RHSInst);
+  }
+
+  // A post-inc use of this loop depends on the increment position, or on the
+  // latch terminator when the use lies fully outside the loop.
+  if (LF.PostIncLoops.count(L)) {
+    if (!LF.isUseFullyOutsideLoop(L))
+      Inputs.push_back(IVIncInsertPos);
+    else
+      Inputs.push_back(L->getLoopLatch()->getTerminator());
+  }
+
+  // The expansion must also be dominated by the increment positions of any
+  // other loops for which it is using post-inc mode.
+  for (const Loop *PIL : LF.PostIncLoops) {
+    if (PIL == L)
+      continue;
+
+    // Be dominated by the loop exit: use the terminator of the nearest common
+    // dominator of all exiting blocks.
+    SmallVector<BasicBlock *, 4> ExitingBlocks;
+    PIL->getExitingBlocks(ExitingBlocks);
+    if (ExitingBlocks.empty())
+      continue;
+
+    BasicBlock *CommonDom = ExitingBlocks.front();
+    for (auto It = std::next(ExitingBlocks.begin()), End = ExitingBlocks.end();
+         It != End; ++It)
+      CommonDom = DT.findNearestCommonDominator(CommonDom, *It);
+    Inputs.push_back(CommonDom->getTerminator());
+  }
+
+  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
+         && !isa<DbgInfoIntrinsic>(LowestIP) &&
+         "Insertion point must be a normal instruction");
+
+  // Climb up the immediate dominator tree as far as we can go while still
+  // being dominated by the input positions.
+  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+  // Never insert ahead of PHI nodes.
+  while (isa<PHINode>(IP))
+    ++IP;
+
+  // Step over EH pad instructions, but stop at a terminator.
+  while (!isa<TerminatorInst>(IP) && IP->isEHPad())
+    ++IP;
+
+  // Step over debug intrinsics.
+  while (isa<DbgInfoIntrinsic>(IP))
+    ++IP;
+
+  // Land below instructions recently inserted by SCEVExpander. This keeps the
+  // IP consistent across expansions and allows the previously inserted
+  // instructions to be reused by subsequent expansion.
+  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
+    ++IP;
+
+  return IP;
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding"). Returns the value that computes formula F for LF.
+Value *LSRInstance::Expand(const LSRFixup &LF,
+ const Formula &F,
+ BasicBlock::iterator IP,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts) const {
+ const LSRUse &LU = Uses[LF.LUIdx];
+ if (LU.RigidFormula)
+ return LF.OperandValToReplace; // Rigid uses keep their original operand; nothing is expanded.
+
+ // Determine an input position which will be dominated by the operands and
+ // which will dominate the result.
+ IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+
+ // Inform the Rewriter if we have a post-increment use, so that it can
+ // perform an advantageous expansion.
+ Rewriter.setPostInc(LF.PostIncLoops);
+
+ // This is the type that the user actually needs.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ // This will be the type that we'll initially expand to.
+ Type *Ty = F.getType();
+ if (!Ty)
+ // No type known; just expand directly to the ultimate type.
+ Ty = OpTy;
+ else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+ // Expand directly to the ultimate type if it's the right size.
+ Ty = OpTy;
+ // This is the type to do integer arithmetic in.
+ Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+ // Build up a list of operands to add together to form the full base.
+ SmallVector<const SCEV *, 8> Ops;
+
+ // Expand the BaseRegs portion.
+ for (const SCEV *Reg : F.BaseRegs) {
+ assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); // NOTE(review): presumably TransformForPostIncUse takes a non-const set; confirm.
+ Reg = TransformForPostIncUse(Denormalize, Reg,
+ LF.UserInst, LF.OperandValToReplace,
+ Loops, SE, DT);
+
+ Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP)));
+ }
+
+ // Expand the ScaledReg portion.
+ Value *ICmpScaledV = nullptr; // Only set for ICmpZero uses; becomes the icmp's other operand when Scale is -1.
+ if (F.Scale != 0) {
+ const SCEV *ScaledS = F.ScaledReg;
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+ ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
+ LF.UserInst, LF.OperandValToReplace,
+ Loops, SE, DT);
+
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // Expand ScaleReg as if it was part of the base regs.
+ if (F.Scale == 1)
+ Ops.push_back(
+ SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)));
+ else {
+ // An interesting way of "folding" with an icmp is to use a negated
+ // scale, which we'll implement by inserting it into the other operand
+ // of the icmp.
+ assert(F.Scale == -1 &&
+ "The only scale supported by ICmpZero uses is -1!");
+ ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP);
+ }
+ } else {
+ // Otherwise just expand the scaled register and an explicit scale,
+ // which is expected to be matched as part of the address.
+
+ // Flush the operand list to suppress SCEVExpander hoisting address modes.
+ // Unless the addressing mode will not be folded.
+ if (!Ops.empty() && LU.Kind == LSRUse::Address &&
+ isAMCompletelyFolded(TTI, LU, F)) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP));
+ if (F.Scale != 1)
+ ScaledS =
+ SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
+ Ops.push_back(ScaledS);
+ }
+ }
+
+ // Expand the GV portion.
+ if (F.BaseGV) {
+ // Flush the operand list to suppress SCEVExpander hoisting.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ Ops.push_back(SE.getUnknown(F.BaseGV));
+ }
+
+ // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+ // unfolded offsets. LSR assumes they both live next to their uses.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+
+ // Expand the immediate portion.
+ int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; // Added as unsigned so the sum wraps instead of overflowing.
+ if (Offset != 0) {
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // The other interesting way of "folding" with an ICmpZero is to use a
+ // negated immediate.
+ if (!ICmpScaledV)
+ ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+ else {
+ Ops.push_back(SE.getUnknown(ICmpScaledV));
+ ICmpScaledV = ConstantInt::get(IntTy, Offset);
+ }
+ } else {
+ // Just add the immediate values. These again are expected to be matched
+ // as part of the address.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+ }
+ }
+
+ // Expand the unfolded offset portion.
+ int64_t UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset != 0) {
+ // Just add the immediate values.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+ UnfoldedOffset)));
+ }
+
+ // Emit instructions summing all the operands.
+ const SCEV *FullS = Ops.empty() ?
+ SE.getConstant(IntTy, 0) :
+ SE.getAddExpr(Ops);
+ Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP);
+
+ // We're done expanding now, so reset the rewriter.
+ Rewriter.clearPostInc();
+
+ // An ICmpZero Formula represents an ICmp which we're handling as a
+ // comparison against zero. Now that we've expanded an expression for that
+ // form, update the ICmp's other operand.
+ if (LU.Kind == LSRUse::ICmpZero) {
+ ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+ DeadInsts.emplace_back(CI->getOperand(1)); // Record the operand that is about to be replaced.
+ assert(!F.BaseGV && "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ if (F.Scale == -1) {
+ if (ICmpScaledV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+ OpTy, false),
+ ICmpScaledV, OpTy, "tmp", CI);
+ ICmpScaledV = Cast;
+ }
+ CI->setOperand(1, ICmpScaledV);
+ } else {
+ // A scale of 1 means that the scale has been expanded as part of the
+ // base regs.
+ assert((F.Scale == 0 || F.Scale == 1) &&
+ "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+ -(uint64_t)Offset);
+ if (C->getType() != OpTy)
+ C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ OpTy, false),
+ C, OpTy);
+
+ CI->setOperand(1, C);
+ }
+ }
+
+ return FullV;
+}
+
+/// Helper for Rewrite. PHI nodes are special because the use of their operands
+/// effectively happens in their predecessor blocks, so the expression may need
+/// to be expanded in multiple places: once per distinct incoming block.
+void LSRInstance::RewriteForPHI(PHINode *PN,
+ const LSRFixup &LF,
+ const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakVH> &DeadInsts) const {
+ DenseMap<BasicBlock *, Value *> Inserted; // Value already expanded for each incoming block.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+ BasicBlock *BB = PN->getIncomingBlock(i);
+
+ // If this is a critical edge, split the edge so that we do not insert
+ // the code on all predecessor/successor paths. We do this unless this
+ // is the canonical backedge for this loop, which complicates post-inc
+ // users.
+ if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+ !isa<IndirectBrInst>(BB->getTerminator())) {
+ BasicBlock *Parent = PN->getParent();
+ Loop *PNLoop = LI.getLoopFor(Parent);
+ if (!PNLoop || Parent != PNLoop->getHeader()) {
+ // Split the critical edge.
+ BasicBlock *NewBB = nullptr;
+ if (!Parent->isLandingPad()) {
+ NewBB = SplitCriticalEdge(BB, Parent,
+ CriticalEdgeSplittingOptions(&DT, &LI)
+ .setMergeIdenticalEdges()
+ .setDontDeleteUselessPHIs());
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
+ NewBB = NewBBs[0];
+ }
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB; // Expand in the new block from here on.
+ i = PN->getBasicBlockIndex(BB); // Continue from the PHI entry that now comes from NewBB.
+ }
+ }
+ }
+
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+ Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
+ if (!Pair.second)
+ PN->setIncomingValue(i, Pair.first->second); // Reuse the value expanded earlier for this block.
+ else {
+ Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
+ Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy)
+ FullV =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false,
+ OpTy, false),
+ FullV, LF.OperandValToReplace->getType(),
+ "tmp", BB->getTerminator());
+
+ PN->setIncomingValue(i, FullV);
+ Pair.first->second = FullV; // Remember it for later entries from the same block.
+ }
+ }
+}
+
+/// Materialize formula \p F for fixup \p LF ("expanding") and redirect
+/// LF.UserInst to the freshly expanded value. The operand that was replaced is
+/// appended to \p DeadInsts so the caller can delete it if it became dead.
+void LSRInstance::Rewrite(const LSRFixup &LF,
+                          const Formula &F,
+                          SCEVExpander &Rewriter,
+                          SmallVectorImpl<WeakVH> &DeadInsts) const {
+  // PHI users need one expansion per relevant incoming edge; that is handled
+  // by the dedicated helper.
+  if (PHINode *Phi = dyn_cast<PHINode>(LF.UserInst)) {
+    RewriteForPHI(Phi, LF, F, Rewriter, DeadInsts);
+    DeadInsts.emplace_back(LF.OperandValToReplace);
+    return;
+  }
+
+  // Any other user: expand immediately in front of the user instruction.
+  Value *NewV = Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+
+  // Reuse-by-noop-cast: convert the expanded value to the operand's type.
+  Type *WantedTy = LF.OperandValToReplace->getType();
+  if (NewV->getType() != WantedTy)
+    NewV = CastInst::Create(
+        CastInst::getCastOpcode(NewV, false, WantedTy, false), NewV, WantedTy,
+        "tmp", LF.UserInst);
+
+  // ICmpZero is special-cased (for now): Expand may already have rewritten the
+  // icmp's other operand, and that new operand may equal
+  // LF.OperandValToReplace. replaceUsesOfWith would then overwrite both
+  // operands with the same value, so only operand 0 is set explicitly.
+  // TODO: Reorganize this.
+  const bool IsICmpZero = Uses[LF.LUIdx].Kind == LSRUse::ICmpZero;
+  if (!IsICmpZero)
+    LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, NewV);
+  else
+    LF.UserInst->setOperand(0, NewV);
+
+  DeadInsts.emplace_back(LF.OperandValToReplace);
+}
+
+/// Apply the chosen \p Solution: rewrite every fixup with the formula selected
+/// for its use, generate the collected IV chains, and delete the instructions
+/// that became trivially dead. Sets Changed when anything was rewritten.
+void LSRInstance::ImplementSolution(
+    const SmallVectorImpl<const Formula *> &Solution) {
+  // Values that may have lost their last user; removed once rewriting is done.
+  SmallVector<WeakVH, 16> MaybeDead;
+
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+  SCEVExpander Expander(SE, DL, "lsr");
+#ifndef NDEBUG
+  Expander.setDebugType(DEBUG_TYPE);
+#endif
+  Expander.disableCanonicalMode();
+  Expander.enableLSRMode();
+  Expander.setIVIncInsertPos(L, IVIncInsertPos);
+
+  // Tell the expander about phis that terminate chains so it tries to reuse
+  // them.
+  for (const IVChain &Chain : IVChainVec)
+    if (PHINode *TailPhi = dyn_cast<PHINode>(Chain.tailUserInst()))
+      Expander.setChainedPhi(TailPhi);
+
+  // Expand the new value definitions and update the users.
+  for (const LSRFixup &LF : Fixups) {
+    const Formula &Chosen = *Solution[LF.LUIdx];
+    Rewrite(LF, Chosen, Expander, MaybeDead);
+    Changed = true;
+  }
+
+  for (const IVChain &Chain : IVChainVec) {
+    GenerateIVChain(Chain, Expander, MaybeDead);
+    Changed = true;
+  }
+
+  // The expander must be cleared before any instruction is deleted.
+  Expander.clear();
+
+  if (DeleteTriviallyDeadInstructions(MaybeDead))
+    Changed = true;
+}
+
+LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI)
+ : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
+ IVIncInsertPos(nullptr) { // The constructor runs the whole LSR pipeline on L: bail-out checks, pre-optimizations, collection, formula generation, solving, and rewriting.
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return;
+
+ // If there's no interesting work to be done, bail early.
+ if (IU.empty()) return;
+
+ // If there's too much analysis to be done, bail early. We won't be able to
+ // model the problem anyway.
+ unsigned NumUsers = 0;
+ for (const IVStrideUse &U : IU) {
+ if (++NumUsers > MaxIVUsers) {
+ (void)U; // U is otherwise referenced only inside the DEBUG() statement below.
+ DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+ return;
+ }
+ }
+
+#ifndef NDEBUG
+ // All dominating loops must have preheaders, or SCEVExpander may not be able
+ // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+ //
+ // IVUsers analysis should only create users that are dominated by simple loop
+ // headers. Since this loop should dominate all of its users, its user list
+ // should be empty if this loop itself is not within a simple loop nest.
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+ Rung; Rung = Rung->getIDom()) {
+ BasicBlock *BB = Rung->getBlock();
+ const Loop *DomLoop = LI.getLoopFor(BB);
+ if (DomLoop && DomLoop->getHeader() == BB) {
+ assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+ }
+ }
+#endif // NDEBUG
+
+ DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
+
+ // First, perform some low-level loop optimizations.
+ OptimizeShadowIV();
+ OptimizeLoopTermCond();
+
+ // If loop preparation eliminates all interesting IV users, bail.
+ if (IU.empty()) return;
+
+ // Skip nested loops until we can model them better with formulae.
+ if (!L->empty()) { // Non-empty means L has sub-loops, i.e. it is an outer loop (see the debug message below).
+ DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ return;
+ }
+
+ // Start collecting data and preparing for the solver.
+ CollectChains();
+ CollectInterestingTypesAndFactors();
+ CollectFixupsAndInitialFormulae();
+ CollectLoopInvariantFixupsAndFormulae();
+
+ assert(!Uses.empty() && "IVUsers reported at least one use");
+ DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
+
+ // Now use the reuse data to generate a bunch of interesting ways
+ // to formulate the values needed for the uses.
+ GenerateAllReuseFormulae();
+
+ FilterOutUndesirableDedicatedRegisters();
+ NarrowSearchSpaceUsingHeuristics();
+
+ SmallVector<const Formula *, 8> Solution; // Filled by Solve(); indexed by LSRUse index (see ImplementSolution's Solution[Fixup.LUIdx]).
+ Solve(Solution);
+
+ // Release memory that is no longer needed.
+ Factors.clear();
+ Types.clear();
+ RegUses.clear();
+
+ if (Solution.empty()) // Nothing was selected, so there is nothing to rewrite.
+ return;
+
+#ifndef NDEBUG
+ // Formulae should be legal.
+ for (const LSRUse &LU : Uses) {
+ for (const Formula &F : LU.Formulae)
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ F) && "Illegal formula generated!");
+ };
+#endif
+
+ // Now that we've decided what we want, make it so.
+ ImplementSolution(Solution);
+}
+
+/// Print the interesting factors (as "*N") and types (as "(Ty)") collected by
+/// LSR as one comma-separated line on \p OS. Prints nothing when both sets are
+/// empty.
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+  if (Factors.empty() && Types.empty())
+    return;
+
+  OS << "LSR has identified the following interesting factors and types: ";
+
+  // The separator is empty for the very first item and ", " afterwards; it is
+  // shared between both lists so that they form a single sequence.
+  const char *Separator = "";
+
+  for (int64_t Factor : Factors) {
+    OS << Separator << '*' << Factor;
+    Separator = ", ";
+  }
+
+  for (Type *Ty : Types) {
+    OS << Separator << '(' << *Ty << ')';
+    Separator = ", ";
+  }
+
+  OS << '\n';
+}
+
+/// Print every fixup site, one per line, to \p OS.
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+  OS << "LSR is examining the following fixup sites:\n";
+  for (const LSRFixup &LF : Fixups) {
+    // Write the indentation to OS, not dbgs(): the caller chooses the stream
+    // (dump() passes errs()), and all output must land on that same stream.
+    OS << " ";
+    LF.print(OS);
+    OS << '\n';
+  }
+}
+
+/// Print every LSRUse, each followed by its candidate formulae, to \p OS.
+void LSRInstance::print_uses(raw_ostream &OS) const {
+  OS << "LSR is examining the following uses:\n";
+  for (const LSRUse &LU : Uses) {
+    // Write the indentation to OS, not dbgs(): the caller chooses the stream
+    // (dump() passes errs()), and all output must land on that same stream.
+    OS << " ";
+    LU.print(OS);
+    OS << '\n';
+    for (const Formula &F : LU.Formulae) {
+      OS << " ";
+      F.print(OS);
+      OS << '\n';
+    }
+  }
+}
+
+void LSRInstance::print(raw_ostream &OS) const { // Print the complete LSR state to OS, in the order below.
+ print_factors_and_types(OS); // Emits nothing when both sets are empty.
+ print_fixups(OS);
+ print_uses(OS); // Each use is followed by its formulae.
+}
+
+/// Print the complete LSR state to errs(), followed by a newline.
+LLVM_DUMP_METHOD
+void LSRInstance::dump() const {
+  raw_ostream &Err = errs();
+  print(Err);
+  Err << '\n';
+}
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass { // LoopPass whose runOnLoop constructs an LSRInstance for the loop it is given.
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopStrengthReduce(); // Registers the pass with the global PassRegistry (see the definition).
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override; // Returns true when the loop was changed.
+ void getAnalysisUsage(AnalysisUsage &AU) const override; // Declares the required and preserved analyses.
+};
+
+} // end anonymous namespace
+
+char LoopStrengthReduce::ID = 0; // Storage for the pass ID declared in the class.
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // The dependencies below correspond to the analyses required in getAnalysisUsage().
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsers)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+
+
+Pass *llvm::createLoopStrengthReducePass() { // Public factory: returns a newly allocated LoopStrengthReduce pass.
+ return new LoopStrengthReduce(); // NOTE(review): presumably ownership passes to the pass manager the result is added to — confirm at call sites.
+}
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) { // Construct the pass under its static ID.
+ initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); // Register this pass with the global registry.
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // Declare required/preserved analyses; per the note below, the order of the requirements is deliberate.
+ // We split critical edges, so we change the CFG. However, we do update
+ // many analyses if they are around.
+ AU.addPreservedID(LoopSimplifyID);
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ // Requiring LoopSimplify a second time here prevents IVUsers from running
+ // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<IVUsers>();
+ AU.addPreserved<IVUsers>();
+ AU.addRequired<TargetTransformInfoWrapperPass>(); // Required only; not declared preserved here.
+}
+
+/// Run loop strength reduction on \p L, then clean up dead and congruent phis
+/// in the loop header. Returns true if the IR was modified.
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  BasicBlock *Header = L->getHeader();
+  IVUsers &IU = getAnalysis<IVUsers>();
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  const TargetTransformInfo &TTI =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+          *Header->getParent());
+
+  // Run the main LSR transformation; all the work happens in the constructor.
+  bool Changed = LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+
+  // Remove any extra phis created by processing inner loops.
+  Changed |= DeleteDeadPHIs(L->getHeader());
+
+  // Congruent-IV elimination is optional and needs LoopSimplify form.
+  if (!EnablePhiElim || !L->isLoopSimplifyForm())
+    return Changed;
+
+  SmallVector<WeakVH, 16> DeadInsts;
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+  SCEVExpander Rewriter(SE, DL, "lsr");
+#ifndef NDEBUG
+  Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+  const unsigned NumFolded =
+      Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
+  if (NumFolded == 0)
+    return Changed;
+
+  DeleteTriviallyDeadInstructions(DeadInsts);
+  DeleteDeadPHIs(L->getHeader());
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
new file mode 100644
index 0000000..ecef6db
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -0,0 +1,1030 @@
+//===-- LoopUnrollPass.cpp - Loop unroller pass ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller. It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <climits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+// Command-line knobs for the loop unroller. None of them has an initial value
+// except where cl::init is given; gatherUnrollingPreferences() only applies an
+// option when it actually occurred on the command line.
+
+static cl::opt<unsigned>
+    UnrollThreshold("unroll-threshold", cl::Hidden,
+                    cl::desc("The baseline cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold(
+    "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden,
+    cl::desc("The percentage of estimated dynamic cost which must be saved by "
+             "unrolling to allow unrolling up to the max threshold."));
+
+// The description refers to the option declared just above; keep the flag name
+// spelled exactly like that option's name.
+static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount(
+    "unroll-dynamic-cost-savings-discount", cl::Hidden,
+    cl::desc("This is the amount discounted from the total unroll cost when "
+             "the unrolled form has a high dynamic cost savings (triggered by "
+             "the '-unroll-percent-dynamic-cost-saved-threshold' flag)."));
+
+// A value of 0 (the default) disables the full-unroll simulation in
+// analyzeLoopUnrollCost(). Adjacent string literals are concatenated, so the
+// first one must end with a space.
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+    "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+    cl::desc("Don't allow loop unrolling to simulate more than this number of "
+             "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned>
+UnrollCount("unroll-count", cl::Hidden,
+  cl::desc("Use this unroll count for all loops including those with "
+           "unroll_count pragma values, for testing purposes"));
+
+static cl::opt<bool>
+UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+  cl::desc("Allows loops to be partially unrolled until "
+           "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool>
+UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+  cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned>
+PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+  cl::desc("Unrolled size limit for loops with an unroll(full) or "
+           "unroll_count pragma."));
+
+
+/// A magic value for use with the Threshold parameter to indicate
+/// that the loop unroll should be performed regardless of how much
+/// code expansion would result.
+static const unsigned NoThreshold = UINT_MAX;
+
+/// Default unroll count for loops with run-time trip count if
+/// -unroll-count is not set
+static const unsigned DefaultUnrollRuntimeCount = 8;
+
+/// Compute the effective unrolling preferences for \p L.
+///
+/// Sources are applied in increasing priority, each overriding the previous
+/// one: built-in defaults, target (TTI) overrides, optimize-for-size limits,
+/// unroll-count pragmas, command-line options, and finally the values passed
+/// in by the caller. Afterwards, loops carrying an unroll pragma get their
+/// thresholds raised to at least PragmaUnrollThreshold.
+static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+    Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
+    Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
+    Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll,
+    bool PragmaEnableUnroll, unsigned TripCount) {
+  TargetTransformInfo::UnrollingPreferences Prefs;
+
+  // 1. Built-in defaults.
+  Prefs.Threshold = 150;
+  Prefs.PercentDynamicCostSavedThreshold = 20;
+  Prefs.DynamicCostSavingsDiscount = 2000;
+  Prefs.OptSizeThreshold = 50;
+  Prefs.PartialThreshold = Prefs.Threshold;
+  Prefs.PartialOptSizeThreshold = Prefs.OptSizeThreshold;
+  Prefs.Count = 0;
+  Prefs.MaxCount = UINT_MAX;
+  Prefs.Partial = false;
+  Prefs.Runtime = false;
+  Prefs.AllowExpensiveTripCount = false;
+
+  // 2. Target-specific overrides.
+  TTI.getUnrollingPreferences(L, Prefs);
+
+  // 3. Functions optimized for size use the (smaller) size thresholds.
+  const Function *Fn = L->getHeader()->getParent();
+  if (Fn->optForSize()) {
+    Prefs.Threshold = Prefs.OptSizeThreshold;
+    Prefs.PartialThreshold = Prefs.PartialOptSizeThreshold;
+  }
+
+  // 4. Unroll-count pragmas: an explicit count wins over unroll(full).
+  if (PragmaCount != 0)
+    Prefs.Count = PragmaCount;
+  else if (PragmaFullUnroll)
+    Prefs.Count = TripCount;
+
+  // 5. Command-line options, but only those actually given.
+  if (UnrollThreshold.getNumOccurrences() > 0) {
+    Prefs.Threshold = UnrollThreshold;
+    Prefs.PartialThreshold = UnrollThreshold;
+  }
+  if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0)
+    Prefs.PercentDynamicCostSavedThreshold =
+        UnrollPercentDynamicCostSavedThreshold;
+  if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
+    Prefs.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
+  if (UnrollCount.getNumOccurrences() > 0)
+    Prefs.Count = UnrollCount;
+  if (UnrollAllowPartial.getNumOccurrences() > 0)
+    Prefs.Partial = UnrollAllowPartial;
+  if (UnrollRuntime.getNumOccurrences() > 0)
+    Prefs.Runtime = UnrollRuntime;
+
+  // 6. Values supplied by the caller.
+  if (UserThreshold.hasValue()) {
+    const unsigned T = *UserThreshold;
+    Prefs.Threshold = T;
+    Prefs.PartialThreshold = T;
+  }
+  if (UserCount.hasValue())
+    Prefs.Count = *UserCount;
+  if (UserAllowPartial.hasValue())
+    Prefs.Partial = *UserAllowPartial;
+  if (UserRuntime.hasValue())
+    Prefs.Runtime = *UserRuntime;
+
+  // 7. A loop with an unrolling pragma is unrolled more aggressively: raise
+  // both thresholds to at least PragmaUnrollThreshold, unless a threshold is
+  // the NoThreshold sentinel.
+  const bool HasEffectivePragma =
+      PragmaCount > 0 ||
+      ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0);
+  if (HasEffectivePragma) {
+    if (Prefs.Threshold != NoThreshold)
+      Prefs.Threshold =
+          std::max<unsigned>(Prefs.Threshold, PragmaUnrollThreshold);
+    if (Prefs.PartialThreshold != NoThreshold)
+      Prefs.PartialThreshold =
+          std::max<unsigned>(Prefs.PartialThreshold, PragmaUnrollThreshold);
+  }
+
+  return Prefs;
+}
+
+namespace {
+// This class is used to get an estimate of the optimization effects that we
+// could get from complete loop unrolling. It comes from the fact that some
+// loads might be replaced with concrete constant values and that could trigger
+// a chain of instruction simplifications.
+//
+// E.g. we might have:
+//   int a[] = {0, 1, 0};
+//   v = 0;
+//   for (i = 0; i < 3; i ++)
+//     v += b[i]*a[i];
+// If we completely unroll the loop, we would get:
+//   v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
+// Which then will be simplified to:
+//   v = b[0]* 0 + b[1]* 1 + b[2]* 0
+// And finally:
+//   v = b[1]
+//
+// One analyzer instance models a single loop iteration. Every visit method
+// returns true when the visited instruction is expected to be simplified away
+// on that iteration; the IR itself is never modified.
+class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> {
+  typedef InstVisitor<UnrolledInstAnalyzer, bool> Base;
+  friend class InstVisitor<UnrolledInstAnalyzer, bool>;
+
+  /// An address expressed as a base value plus a constant offset.
+  struct SimplifiedAddress {
+    Value *Base = nullptr;
+    ConstantInt *Offset = nullptr;
+  };
+
+public:
+  /// \param Iteration The index of the iteration being simulated.
+  /// \param SimplifiedValues Value->Constant map shared with the caller; it is
+  /// read and extended by the visit methods.
+  /// \param SE ScalarEvolution used to evaluate expressions at \p Iteration.
+  UnrolledInstAnalyzer(unsigned Iteration,
+                       DenseMap<Value *, Constant *> &SimplifiedValues,
+                       ScalarEvolution &SE)
+      : SimplifiedValues(SimplifiedValues), SE(SE) {
+    IterationNumber = SE.getConstant(APInt(64, Iteration));
+  }
+
+  // Allow access to the initial visit method.
+  using Base::visit;
+
+private:
+  /// \brief A cache of pointer bases and constant-folded offsets corresponding
+  /// to GEP (or derived from GEP) instructions.
+  ///
+  /// In order to find the base pointer one needs to perform non-trivial
+  /// traversal of the corresponding SCEV expression, so it's good to have the
+  /// results saved.
+  DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses;
+
+  /// \brief SCEV expression corresponding to number of currently simulated
+  /// iteration.
+  const SCEV *IterationNumber;
+
+  /// \brief A Value->Constant map for keeping values that we managed to
+  /// constant-fold on the given iteration.
+  ///
+  /// While we walk the loop instructions, we build up and maintain a mapping
+  /// of simplified values specific to this iteration. The idea is to propagate
+  /// any special information we have about loads that can be replaced with
+  /// constants after complete unrolling, and account for likely simplifications
+  /// post-unrolling.
+  DenseMap<Value *, Constant *> &SimplifiedValues;
+
+  ScalarEvolution &SE;
+
+  /// \brief Try to simplify instruction \param I using its SCEV expression.
+  ///
+  /// The idea is that some AddRec expressions become constants, which then
+  /// could trigger folding of other instructions. However, that only happens
+  /// for expressions whose start value is also constant, which isn't always the
+  /// case. In another common and important case the start value is just some
+  /// address (i.e. SCEVUnknown) - in this case we compute the offset and save
+  /// it along with the base address instead.
+  ///
+  /// \returns true if \p I became a constant (recorded in SimplifiedValues) or
+  /// a base+constant-offset address (recorded in SimplifiedAddresses).
+  bool simplifyInstWithSCEV(Instruction *I) {
+    if (!SE.isSCEVable(I->getType()))
+      return false;
+
+    const SCEV *S = SE.getSCEV(I);
+    if (auto *SC = dyn_cast<SCEVConstant>(S)) {
+      SimplifiedValues[I] = SC->getValue();
+      return true;
+    }
+
+    auto *AR = dyn_cast<SCEVAddRecExpr>(S);
+    if (!AR)
+      return false;
+
+    const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE);
+    // Check if the AddRec expression becomes a constant.
+    if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) {
+      SimplifiedValues[I] = SC->getValue();
+      return true;
+    }
+
+    // Check if the offset from the base address becomes a constant.
+    auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S));
+    if (!Base)
+      return false;
+    auto *Offset =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base));
+    if (!Offset)
+      return false;
+    SimplifiedAddress Address;
+    Address.Base = Base->getValue();
+    Address.Offset = Offset->getValue();
+    SimplifiedAddresses[I] = Address;
+    return true;
+  }
+
+  /// Base case for the instruction visitor.
+  bool visitInstruction(Instruction &I) {
+    return simplifyInstWithSCEV(&I);
+  }
+
+  /// Try to simplify binary operator I.
+  ///
+  /// Operands already known to be constant on this iteration are substituted
+  /// before asking InstructionSimplify.
+  ///
+  /// TODO: Probably it's worth to hoist the code for estimating the
+  /// simplifications effects to a separate class, since we have a very similar
+  /// code in InlineCost already.
+  bool visitBinaryOperator(BinaryOperator &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+
+    Value *SimpleV = nullptr;
+    const DataLayout &DL = I.getModule()->getDataLayout();
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV =
+          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
+    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
+      SimplifiedValues[&I] = C;
+
+    // Any simplification counts, even one that does not yield a constant.
+    if (SimpleV)
+      return true;
+    return Base::visitBinaryOperator(I);
+  }
+
+  /// Try to fold load I.
+  ///
+  /// Only loads whose address was simplified to "constant global + constant
+  /// offset" and whose global has a ConstantDataSequential initializer are
+  /// folded. Accesses outside the initializer are conservatively not folded.
+  bool visitLoad(LoadInst &I) {
+    Value *AddrOp = I.getPointerOperand();
+
+    auto AddressIt = SimplifiedAddresses.find(AddrOp);
+    if (AddressIt == SimplifiedAddresses.end())
+      return false;
+    ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset;
+
+    auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base);
+    // We're only interested in loads that can be completely folded to a
+    // constant.
+    if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant())
+      return false;
+
+    ConstantDataSequential *CDS =
+        dyn_cast<ConstantDataSequential>(GV->getInitializer());
+    if (!CDS)
+      return false;
+
+    // We might have a vector load from an array. FIXME: for now we just bail
+    // out in this case, but we should be able to resolve and simplify such
+    // loads.
+    if (!CDS->isElementTypeCompatible(I.getType()))
+      return false;
+
+    unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+
+    // The offset must be representable as int64_t before it can be read.
+    // This used to be an assert on getActiveBits(), which fired for every
+    // negative offset in debug builds and was a no-op in release builds.
+    if (SimplifiedAddrOp->getValue().getMinSignedBits() > 64)
+      return false;
+    int64_t ByteOffset = SimplifiedAddrOp->getSExtValue();
+    if (ByteOffset < 0) {
+      // A negative offset points before the start of the initializer. Without
+      // this check the signed index passed the upper-bound test below and was
+      // then used to index the initializer out of bounds.
+      // FIXME: For now we conservatively ignore out of bound accesses, but
+      // we're allowed to perform the optimization in this case.
+      return false;
+    }
+
+    uint64_t Index = static_cast<uint64_t>(ByteOffset) / ElemSize;
+    if (Index >= CDS->getNumElements()) {
+      // FIXME: For now we conservatively ignore out of bound accesses, but
+      // we're allowed to perform the optimization in this case.
+      return false;
+    }
+
+    Constant *CV = CDS->getElementAsConstant(Index);
+    assert(CV && "Constant expected.");
+    SimplifiedValues[&I] = CV;
+
+    return true;
+  }
+
+  /// Propagate constants through casts.
+  bool visitCastInst(CastInst &I) {
+    Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+    if (!COp)
+      COp = SimplifiedValues.lookup(I.getOperand(0));
+    if (COp)
+      if (Constant *C =
+              ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+
+    return Base::visitCastInst(I);
+  }
+
+  /// Try to fold comparison I, either from operands that are constant on this
+  /// iteration or from two simplified addresses that share the same base.
+  bool visitCmpInst(CmpInst &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+    // First try to handle simplified comparisons.
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+
+    // Two addresses off the same base compare like their constant offsets.
+    if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+      auto SimplifiedLHS = SimplifiedAddresses.find(LHS);
+      if (SimplifiedLHS != SimplifiedAddresses.end()) {
+        auto SimplifiedRHS = SimplifiedAddresses.find(RHS);
+        if (SimplifiedRHS != SimplifiedAddresses.end()) {
+          SimplifiedAddress &LHSAddr = SimplifiedLHS->second;
+          SimplifiedAddress &RHSAddr = SimplifiedRHS->second;
+          if (LHSAddr.Base == RHSAddr.Base) {
+            LHS = LHSAddr.Offset;
+            RHS = RHSAddr.Offset;
+          }
+        }
+      }
+    }
+
+    if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+      if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
+        if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
+          SimplifiedValues[&I] = C;
+          return true;
+        }
+      }
+    }
+
+    return Base::visitCmpInst(I);
+  }
+};
+} // namespace
+
+
+namespace {
+struct EstimatedUnrollCost { // Result pair produced by analyzeLoopUnrollCost().
+ /// \brief The estimated cost after unrolling: the summed cost of the simulated instructions that are not expected to be simplified.
+ int UnrolledCost;
+
+ /// \brief The estimated dynamic cost of executing the instructions in the
+ /// rolled form: the summed cost of every simulated instruction, simplified or not.
+ int RolledDynamicCost;
+};
+}
+
+/// \brief Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities. This routine
+/// estimates this optimization. It computes cost of unrolled loop
+/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
+/// dynamic cost we mean that we won't count costs of blocks that are known not
+/// to be executed (i.e. if we have a branch in the loop and we know that at the
+/// given iteration its condition would be resolved to true, we won't add up the
+/// cost of the 'false'-block).
+/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
+/// the analysis failed (no benefits expected from the unrolling, or the loop is
+/// too big to analyze), the returned value is None. \p MaxUnrolledLoopSize bounds UnrolledCost; exceeding it aborts the analysis. The IR is not modified.
+static Optional<EstimatedUnrollCost>
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ int MaxUnrolledLoopSize) {
+ // We want to be able to scale offsets by the trip count and add more offsets
+ // to them without checking for overflows, and we already don't want to
+ // analyze *massive* trip counts, so we force the max to be reasonably small.
+ assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
+ "The unroll iterations max is too large!");
+
+ // Don't simulate loops with a big or unknown tripcount. A limit of 0 disables the analysis entirely.
+ if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
+ TripCount > UnrollMaxIterationsCountToAnalyze)
+ return None;
+
+ SmallSetVector<BasicBlock *, 16> BBWorklist; // Blocks considered live on the iteration being simulated.
+ DenseMap<Value *, Constant *> SimplifiedValues; // Values known to be constant on the current iteration.
+ SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // Header-PHI constants staged for the next iteration.
+
+ // The estimated cost of the unrolled form of the loop. We try to estimate
+ // this by simplifying as much as we can while computing the estimate.
+ int UnrolledCost = 0;
+ // We also track the estimated dynamic (that is, actually executed) cost in
+ // the rolled form. This helps identify cases when the savings from unrolling
+ // aren't just exposing dead control flows, but actual reduced dynamic
+ // instructions due to the simplifications which we expect to occur after
+ // unrolling.
+ int RolledDynamicCost = 0;
+
+ // Ensure that we don't violate the loop structure invariants relied on by
+ // this analysis.
+ assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+ assert(L->isLCSSAForm(DT) &&
+ "Must have loops in LCSSA form to track live-out values.");
+
+ DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+
+ // Simulate execution of each iteration of the loop counting instructions,
+ // which would be simplified.
+ // Since the same load will take different values on different iterations,
+ // we literally have to go through all loop's iterations.
+ for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+ DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+ // Prepare for the iteration by collecting any simplified entry or backedge
+ // inputs.
+ for (Instruction &I : *L->getHeader()) {
+ auto *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI) // Stop at the first non-PHI instruction of the header.
+ break;
+
+ // The loop header PHI nodes must have exactly two input: one from the
+ // loop preheader and one from the loop latch.
+ assert(
+ PHI->getNumIncomingValues() == 2 &&
+ "Must have an incoming value only for the preheader and the latch.");
+
+ Value *V = PHI->getIncomingValueForBlock(
+ Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+ Constant *C = dyn_cast<Constant>(V);
+ if (Iteration != 0 && !C)
+ C = SimplifiedValues.lookup(V); // Still holds the previous iteration's results at this point.
+ if (C)
+ SimplifiedInputValues.push_back({PHI, C});
+ }
+
+ // Now clear and re-populate the map for the next iteration.
+ SimplifiedValues.clear();
+ while (!SimplifiedInputValues.empty())
+ SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
+ UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE);
+
+ BBWorklist.clear();
+ BBWorklist.insert(L->getHeader());
+ // Note that we *must not* cache the size, this loop grows the worklist.
+ for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+ BasicBlock *BB = BBWorklist[Idx];
+
+ // Visit all instructions in the given basic block and try to simplify
+ // it. We don't change the actual IR, just count optimization
+ // opportunities.
+ for (Instruction &I : *BB) {
+ int InstCost = TTI.getUserCost(&I);
+
+ // Visit the instruction to analyze its loop cost after unrolling,
+ // and if the visitor returns false, include this instruction in the
+ // unrolled cost.
+ if (!Analyzer.visit(I))
+ UnrolledCost += InstCost;
+ else {
+ DEBUG(dbgs() << " " << I
+ << " would be simplified if loop is unrolled.\n");
+ (void)0; // NOTE(review): presumably keeps the else-branch non-empty when DEBUG() expands to nothing — confirm.
+ }
+
+ // Also track this instructions expected cost when executing the rolled
+ // loop form.
+ RolledDynamicCost += InstCost;
+
+ // If unrolled body turns out to be too big, bail out.
+ if (UnrolledCost > MaxUnrolledLoopSize) {
+ DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
+ return None;
+ }
+ }
+
+ TerminatorInst *TI = BB->getTerminator();
+
+ // Add in the live successors by first checking whether we have terminator
+ // that may be simplified based on the values simplified by this call.
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional()) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(BI->getCondition())) {
+ BasicBlock *Succ = nullptr;
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ Succ = BI->getSuccessor(0);
+ else
+ Succ = BI->getSuccessor(
+ cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ continue; // Only the taken successor is live; skip the generic successor walk below.
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(SI->getCondition())) {
+ BasicBlock *Succ = nullptr;
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ Succ = SI->getSuccessor(0);
+ else
+ Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond))
+ .getCaseSuccessor();
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ continue; // Only the selected case is live; skip the generic successor walk below.
+ }
+ }
+
+ // Add BB's successors to the worklist. Successors outside the loop are never simulated.
+ for (BasicBlock *Succ : successors(BB))
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ }
+
+ // If we found no optimization opportunities on the first iteration, we
+ // won't find them on later ones too. (The comparison uses the running totals and is evaluated after every iteration.)
+ if (UnrolledCost == RolledDynamicCost) {
+ DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
+ return None;
+ }
+ }
+ DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ return {{UnrolledCost, RolledDynamicCost}};
+}
+
+/// Estimate the size of loop \p L as the instruction count reported by
+/// CodeMetrics over all of its blocks (the ephemeral values collected through
+/// \p AC are handed to the metrics). The result is never smaller than 3.
+/// \p NumCalls receives Metrics.NumInlineCandidates and \p NotDuplicatable
+/// receives Metrics.notDuplicatable.
+static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+                                    bool &NotDuplicatable,
+                                    const TargetTransformInfo &TTI,
+                                    AssumptionCache *AC) {
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+  CodeMetrics Metrics;
+  Loop::block_iterator BlockIt = L->block_begin();
+  const Loop::block_iterator BlockEnd = L->block_end();
+  while (BlockIt != BlockEnd) {
+    Metrics.analyzeBasicBlock(*BlockIt, TTI, EphValues);
+    ++BlockIt;
+  }
+
+  NumCalls = Metrics.NumInlineCandidates;
+  NotDuplicatable = Metrics.notDuplicatable;
+
+  // Never report fewer than three instructions. A zero estimate would permit
+  // unrolling loops with huge iteration counts (a compile-time problem), and
+  // users of this size may assume every loop has at least a conditional
+  // branch, the comparison feeding it, and a loop increment.
+  const unsigned MinLoopSize = 3u;
+  const unsigned MeasuredSize = Metrics.NumInsts;
+  return MeasuredSize < MinLoopSize ? MinLoopSize : MeasuredSize;
+}
+
+// Looks up the loop hint metadata node called Name (for example,
+// "llvm.loop.unroll.count") on loop L. Yields nullptr when the loop has no
+// loop ID; otherwise yields whatever GetUnrollMetadata finds for that ID.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+  MDNode *LoopID = L->getLoopID();
+  return LoopID ? GetUnrollMetadata(LoopID, Name) : nullptr;
+}
+
+// True when loop L carries "llvm.loop.unroll.full" metadata (unroll(full)).
+static bool HasUnrollFullPragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full") != nullptr;
+}
+
+// True when loop L carries "llvm.loop.unroll.enable" metadata. That metadata
+// serves both "#pragma unroll" and "#pragma clang loop unroll(enable)".
+static bool HasUnrollEnablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable") != nullptr;
+}
+
+// True when loop L carries "llvm.loop.unroll.disable" metadata
+// (unroll(disable)).
+static bool HasUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable") != nullptr;
+}
+
+// True when loop L carries "llvm.loop.unroll.runtime.disable" metadata, i.e.
+// a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable") !=
+         nullptr;
+}
+
+// Yields the value of the loop's unroll_count pragma, read from the second
+// operand of its "llvm.loop.unroll.count" metadata (asserted to be at least
+// 1). Yields 0 when the loop has no such metadata.
+static unsigned UnrollCountPragmaValue(const Loop *L) {
+  MDNode *CountMD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+  if (!CountMD)
+    return 0;
+
+  assert(CountMD->getNumOperands() == 2 &&
+         "Unroll count hint metadata should have two operands.");
+  auto *CountValue = mdconst::extract<ConstantInt>(CountMD->getOperand(1));
+  unsigned Count = CountValue->getZExtValue();
+  assert(Count >= 1 && "Unroll count must be positive.");
+  return Count;
+}
+
+// Replaces the loop ID of L with a copy that drops every operand whose first
+// element is a string starting with "llvm.loop.unroll." and appends an
+// "llvm.loop.unroll.disable" node instead. This marks the loop as already
+// unrolled, so that running the unrolling pass again (which generally
+// happens) does not unroll it beyond what a pragma asked for. Loops without
+// a loop ID are left untouched.
+static void SetLoopAlreadyUnrolled(Loop *L) {
+  MDNode *OldID = L->getLoopID();
+  if (!OldID)
+    return;
+
+  // Slot 0 is a placeholder for the self reference that is patched in below.
+  SmallVector<Metadata *, 4> Operands;
+  Operands.push_back(nullptr);
+
+  // Carry over every operand that is not loop unrolling metadata.
+  const unsigned NumOperands = OldID->getNumOperands();
+  for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
+    Metadata *Op = OldID->getOperand(Idx);
+    if (MDNode *Node = dyn_cast<MDNode>(Op))
+      if (const MDString *Tag = dyn_cast<MDString>(Node->getOperand(0)))
+        if (Tag->getString().startswith("llvm.loop.unroll."))
+          continue;
+    Operands.push_back(Op);
+  }
+
+  // Append unroll(disable) metadata to disable future unrolling.
+  LLVMContext &Ctx = L->getHeader()->getContext();
+  Metadata *DisableTag = MDString::get(Ctx, "llvm.loop.unroll.disable");
+  Operands.push_back(MDNode::get(Ctx, DisableTag));
+
+  // Build the new loop ID and make operand 0 refer to the node itself.
+  MDNode *FreshID = MDNode::get(Ctx, Operands);
+  FreshID->replaceOperandWith(0, FreshID);
+  L->setLoopID(FreshID);
+}
+
+/// Decide whether a loop may be fully unrolled given its estimated costs.
+///
+/// Full unrolling is accepted when \p Threshold is NoThreshold, when
+/// \p UnrolledCost does not exceed \p Threshold, or when the percentage of
+/// \p RolledDynamicCost saved by unrolling reaches
+/// \p PercentDynamicCostSavedThreshold and \p UnrolledCost minus
+/// \p DynamicCostSavingsDiscount does not exceed \p Threshold.
+/// \p L is not consulted by the decision.
+static bool canUnrollCompletely(Loop *L, unsigned Threshold,
+                                unsigned PercentDynamicCostSavedThreshold,
+                                unsigned DynamicCostSavingsDiscount,
+                                uint64_t UnrolledCost,
+                                uint64_t RolledDynamicCost) {
+  if (Threshold == NoThreshold) {
+    DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n");
+    return true;
+  }
+
+  if (UnrolledCost <= Threshold) {
+    DEBUG(dbgs() << " Can fully unroll, because unrolled cost: "
+                 << UnrolledCost << "<" << Threshold << "\n");
+    return true;
+  }
+
+  assert(UnrolledCost && "UnrolledCost can't be 0 at this point.");
+  assert(RolledDynamicCost >= UnrolledCost &&
+         "Cannot have a higher unrolled cost than a rolled cost!");
+
+  // Percentage of the rolled form's dynamic cost that unrolling removes. A
+  // dramatic reduction lets the discounted cost be checked against the
+  // threshold, which permits more unrolling.
+  const uint64_t SavedCost = RolledDynamicCost - UnrolledCost;
+  unsigned PercentDynamicCostSaved = SavedCost * 100ull / RolledDynamicCost;
+
+  if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold &&
+      static_cast<int64_t>(UnrolledCost) -
+              static_cast<int64_t>(DynamicCostSavingsDiscount) <=
+          static_cast<int64_t>(Threshold)) {
+    DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the "
+                    "expected dynamic cost by " << PercentDynamicCostSaved
+                 << "% (threshold: " << PercentDynamicCostSavedThreshold
+                 << "%)\n"
+                 << " and the unrolled cost (" << UnrolledCost
+                 << ") is less than the max threshold ("
+                 << DynamicCostSavingsDiscount << ").\n");
+    return true;
+  }
+
+  // Rejected: dump every input that took part in the decision.
+  DEBUG(dbgs() << " Too large to fully unroll:\n"
+               << " Threshold: " << Threshold << "\n"
+               << " Max threshold: " << DynamicCostSavingsDiscount << "\n"
+               << " Percent cost saved threshold: "
+               << PercentDynamicCostSavedThreshold << "%\n"
+               << " Unrolled cost: " << UnrolledCost << "\n"
+               << " Rolled dynamic cost: " << RolledDynamicCost << "\n"
+               << " Percent cost saved: " << PercentDynamicCostSaved
+               << "\n");
+  return false;
+}
+
+/// Decide whether and how to unroll loop \p L and, if worthwhile, unroll it.
+///
+/// The unroll count comes from the gathered unrolling preferences (built from
+/// the Provided* overrides, the loop's pragmas and the constant trip count)
+/// or, when those leave it at zero, from the trip count or
+/// DefaultUnrollRuntimeCount. The loop is then classified for full, partial
+/// or runtime unrolling and the count is reduced to respect the size
+/// thresholds before UnrollLoop is invoked.
+///
+/// \returns true if UnrollLoop reports success, false if unrolling was
+/// rejected or UnrollLoop failed.
+static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+                            ScalarEvolution *SE, const TargetTransformInfo &TTI,
+                            AssumptionCache &AC, bool PreserveLCSSA,
+                            Optional<unsigned> ProvidedCount,
+                            Optional<unsigned> ProvidedThreshold,
+                            Optional<bool> ProvidedAllowPartial,
+                            Optional<bool> ProvidedRuntime) {
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
+               << "] Loop %" << Header->getName() << "\n");
+
+  if (HasUnrollDisablePragma(L)) {
+    return false;
+  }
+  bool PragmaFullUnroll = HasUnrollFullPragma(L);
+  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
+  unsigned PragmaCount = UnrollCountPragmaValue(L);
+  bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0;
+
+  // Find trip count and trip multiple if count is not available
+  unsigned TripCount = 0;
+  unsigned TripMultiple = 1;
+  // If there are multiple exiting blocks but one of them is the latch, use the
+  // latch for the trip count estimation. Otherwise insist on a single exiting
+  // block for the trip count estimation.
+  BasicBlock *ExitingBlock = L->getLoopLatch();
+  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+    ExitingBlock = L->getExitingBlock();
+  if (ExitingBlock) {
+    TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+    TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+  }
+
+  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+      L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+      ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll,
+      TripCount);
+
+  unsigned Count = UP.Count;
+  bool CountSetExplicitly = Count != 0;
+  // Use a heuristic count if we didn't set anything explicitly.
+  if (!CountSetExplicitly)
+    Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount;
+  if (TripCount && Count > TripCount)
+    Count = TripCount;
+
+  unsigned NumInlineCandidates;
+  bool notDuplicatable;
+  unsigned LoopSize =
+      ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC);
+  DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+
+  // When computing the unrolled size, note that the conditional branch on the
+  // backedge and the comparison feeding it are not replicated like the rest of
+  // the loop body (which is why 2 is subtracted).
+  uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
+  if (notDuplicatable) {
+    DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+                 << " instructions.\n");
+    return false;
+  }
+  if (NumInlineCandidates != 0) {
+    DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+    return false;
+  }
+
+  // Given Count, TripCount and thresholds determine the type of
+  // unrolling which is to be performed.
+  enum { Full = 0, Partial = 1, Runtime = 2 };
+  int Unrolling;
+  if (TripCount && Count == TripCount) {
+    Unrolling = Partial;
+    // If the loop is really small, we don't need to run an expensive analysis.
+    if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount,
+                            UnrolledSize, UnrolledSize)) {
+      Unrolling = Full;
+    } else {
+      // The loop isn't that small, but we still can fully unroll it if that
+      // helps to remove a significant number of instructions.
+      // To check that, run additional analysis on the loop.
+      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+              L, TripCount, DT, *SE, TTI,
+              UP.Threshold + UP.DynamicCostSavingsDiscount))
+        if (canUnrollCompletely(L, UP.Threshold,
+                                UP.PercentDynamicCostSavedThreshold,
+                                UP.DynamicCostSavingsDiscount,
+                                Cost->UnrolledCost, Cost->RolledDynamicCost)) {
+          Unrolling = Full;
+        }
+    }
+  } else if (TripCount && Count < TripCount) {
+    Unrolling = Partial;
+  } else {
+    Unrolling = Runtime;
+  }
+
+  // Reduce count based on the type of unrolling and the threshold values.
+  unsigned OriginalCount = Count;
+  bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime;
+  // Don't unroll a runtime trip count loop with unroll full pragma.
+  if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
+    AllowRuntime = false;
+  }
+  if (Unrolling == Partial) {
+    bool AllowPartial = PragmaEnableUnroll || UP.Partial;
+    if (!AllowPartial && !CountSetExplicitly) {
+      DEBUG(dbgs() << " will not try to unroll partially because "
+                   << "-unroll-allow-partial not given\n");
+      return false;
+    }
+    if (UP.PartialThreshold != NoThreshold &&
+        UnrolledSize > UP.PartialThreshold) {
+      // Reduce unroll count to be modulo of TripCount for partial unrolling.
+      Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2);
+      while (Count != 0 && TripCount % Count != 0)
+        Count--;
+    }
+  } else if (Unrolling == Runtime) {
+    if (!AllowRuntime && !CountSetExplicitly) {
+      DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
+                   << "-unroll-runtime not given\n");
+      return false;
+    }
+    // Reduce unroll count to be the largest power-of-two factor of
+    // the original count which satisfies the threshold limit.
+    while (Count != 0 && UnrolledSize > UP.PartialThreshold) {
+      Count >>= 1;
+      // Multiply in 64 bits, exactly like the initial UnrolledSize
+      // computation above, so that a large explicit count cannot wrap around
+      // in 32-bit arithmetic and appear to fit under the threshold.
+      UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
+    }
+    if (Count > UP.MaxCount)
+      Count = UP.MaxCount;
+    DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n");
+  }
+
+  if (HasPragma) {
+    if (PragmaCount != 0)
+      // If loop has an unroll count pragma mark loop as unrolled to prevent
+      // unrolling beyond that requested by the pragma.
+      SetLoopAlreadyUnrolled(L);
+
+    // Emit optimization remarks if we are unable to unroll the loop
+    // as directed by a pragma.
+    DebugLoc LoopLoc = L->getStartLoc();
+    Function *F = Header->getParent();
+    LLVMContext &Ctx = F->getContext();
+    if ((PragmaCount > 0) && Count != OriginalCount) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to unroll loop the number of times directed by "
+          "unroll_count pragma because unrolled size is too large.");
+    } else if (PragmaFullUnroll && !TripCount) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to fully unroll loop as directed by unroll(full) pragma "
+          "because loop has a runtime trip count.");
+    } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to unroll loop as directed by unroll(enable) pragma because "
+          "unrolled size is too large.");
+    } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+               Count != TripCount) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to fully unroll loop as directed by unroll pragma because "
+          "unrolled size is too large.");
+    }
+  }
+
+  if (Unrolling != Full && Count < 2) {
+    // Partial unrolling by 1 is a nop. For full unrolling, a factor
+    // of 1 makes sense because loop control can be eliminated.
+    return false;
+  }
+
+  // Unroll the loop.
+  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount,
+                  TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA))
+    return false;
+
+  return true;
+}
+
+namespace {
+class LoopUnroll : public LoopPass { // Loop pass wrapper that forwards to tryToUnrollLoop.
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopUnroll(Optional<unsigned> Threshold = None,
+ Optional<unsigned> Count = None,
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None)
+ : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold),
+ ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) {
+ initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+ }
+
+ Optional<unsigned> ProvidedCount; // Count given to the constructor (None if omitted).
+ Optional<unsigned> ProvidedThreshold; // Threshold given to the constructor (None if omitted).
+ Optional<bool> ProvidedAllowPartial; // AllowPartial given to the constructor (None if omitted).
+ Optional<bool> ProvidedRuntime; // Runtime given to the constructor (None if omitted).
+
+ bool runOnLoop(Loop *L, LPPassManager &) override {
+ if (skipOptnoneFunction(L)) // Leave the loop alone when this check asks to skip it.
+ return false;
+
+ Function &F = *L->getHeader()->getParent(); // Function that contains the loop.
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Forwarded to tryToUnrollLoop.
+
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, PreserveLCSSA, ProvidedCount,
+ ProvidedThreshold, ProvidedAllowPartial,
+ ProvidedRuntime);
+ }
+
+ /// This transformation requires natural loop information and requires that
+ /// loop preheaders be inserted into the CFG (LoopSimplify). It also requires
+ /// and preserves LCSSA form and ScalarEvolution.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+ // If loop unroll does not preserve dom info then the LCSSA pass on the next
+ // loop will receive invalid dom info.
+ // For now, dom info is declared preserved here; NOTE(review): presumably it is recreated elsewhere when a loop is unrolled - confirm.
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char LoopUnroll::ID = 0; // Definition of the static pass ID declared in LoopUnroll.
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // The dependencies listed here mirror the
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) // analyses that LoopUnroll::getAnalysisUsage
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // marks as required.
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+/// Create a LoopUnroll pass from integer settings. For each of the four
+/// arguments the value -1 means "not provided" and becomes None; any other
+/// value is wrapped in an Optional and handed to the pass.
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
+                                 int Runtime) {
+  // TODO: It would make more sense for this function to take the optionals
+  // directly, but that's dangerous since it would silently break out of tree
+  // callers.
+  Optional<unsigned> ThresholdOpt =
+      Threshold != -1 ? Optional<unsigned>(Threshold) : None;
+  Optional<unsigned> CountOpt = Count != -1 ? Optional<unsigned>(Count) : None;
+  Optional<bool> AllowPartialOpt =
+      AllowPartial != -1 ? Optional<bool>(AllowPartial) : None;
+  Optional<bool> RuntimeOpt = Runtime != -1 ? Optional<bool>(Runtime) : None;
+
+  return new LoopUnroll(ThresholdOpt, CountOpt, AllowPartialOpt, RuntimeOpt);
+}
+
+/// Create a LoopUnroll pass with threshold and count left unspecified (-1)
+/// and with the partial and runtime settings both passed as 0.
+Pass *llvm::createSimpleLoopUnrollPass() {
+  const int NotProvided = -1;
+  const int Off = 0;
+  return llvm::createLoopUnrollPass(NotProvided, NotProvided, Off, Off);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 0000000..95d7f8a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1342 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to have multiple loops. For example, it turns the left into the right code:
+//
+//  for (...)                  if (lic)
+//    A                          for (...)
+//    if (lic)                     A; B; C
+//      B                      else
+//    C                          for (...)
+//                                 A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unswitch"
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts, "Total number of instructions analyzed"); // Incremented by FindLIVLoopCondition.
+
+// The default of 100 was chosen based only on intuition and a few specific
+// examples. The flag's value seeds LUAnalysisCache::MaxSize.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+ cl::init(100), cl::Hidden);
+
+static cl::opt<bool>
+LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
+ cl::init(false), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics to minimize code growth in cold regions."));
+
+static cl::opt<unsigned>
+ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
+ cl::desc("Coldness threshold in percentage. The loop header frequency "
+ "(relative to the entry frequency) is compared with this "
+ "threshold to determine if non-trivial unswitching should be "
+ "enabled."));
+
+namespace {
+
+ class LUAnalysisCache { // Per-loop bookkeeping: size estimates, unswitching quotas, unswitched switch values.
+
+ typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> >
+ UnswitchedValsMap;
+
+ typedef UnswitchedValsMap::iterator UnswitchedValsIt;
+
+ struct LoopProperties {
+ unsigned CanBeUnswitchedCount; // Remaining unswitching quota for the loop.
+ unsigned WasUnswitchedCount; // Incremented by cloneData each time the loop is cloned.
+ unsigned SizeEstimation; // CodeMetrics instruction count computed in countLoop.
+ UnswitchedValsMap UnswitchedVals; // Case values already unswitched, per switch.
+ };
+
+ // Here we use std::map instead of DenseMap, since we need to keep a valid
+ // LoopProperties pointer for the current loop for better performance.
+ typedef std::map<const Loop*, LoopProperties> LoopPropsMap;
+ typedef LoopPropsMap::iterator LoopPropsMapIt;
+
+ LoopPropsMap LoopsProperties;
+ UnswitchedValsMap *CurLoopInstructions; // UnswitchedVals of the current loop; set by countLoop.
+ LoopProperties *CurrentLoopProperties; // Properties of the current loop; set by countLoop.
+
+ // A loop unswitching with an estimated cost above this threshold
+ // is not performed. MaxSize is turned into unswitching quota for
+ // the current loop, and reduced correspondingly, though note that
+ // the quota is returned by releaseMemory() when the loop has been
+ // processed, so that MaxSize will return to its previous
+ // value. So in most cases MaxSize will equal the Threshold flag
+ // when a new loop is processed. An exception to that is that
+ // MaxSize will have a smaller value while processing nested loops
+ // that were introduced due to loop unswitching of an outer loop.
+ //
+ // FIXME: The way that MaxSize works is subtle and depends on the
+ // pass manager processing loops and calling releaseMemory() in a
+ // specific order. It would be good to find a more straightforward
+ // way of doing what MaxSize does.
+ unsigned MaxSize;
+
+ public:
+ LUAnalysisCache()
+ : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr),
+ MaxSize(Threshold) {}
+
+ // Analyze the loop: compute its size and its unswitching quota.
+ // Returns true if we can unswitch this loop.
+ bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC);
+
+ // Clean all data related to the given loop.
+ void forgetLoop(const Loop *L);
+
+ // Mark a case value as unswitched.
+ // Since an SI instruction can be partly unswitched, keep track of all
+ // unswitched values in order to avoid extra unswitching in cloned loops.
+ void setUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Check whether this case value was unswitched before.
+ bool isUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Returns true if another unswitching could be done within the cost
+ // threshold.
+ bool CostAllowsUnswitching();
+
+ // Clone all loop-unswitch related loop properties.
+ // Redistribute unswitching quotas.
+ // Note that the old-to-new switch instruction mapping is looked up in VMap.
+ void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap);
+ };
+
+ class LoopUnswitch : public LoopPass {
+ LoopInfo *LI; // Loop information
+ LPPassManager *LPM; // Not set by the constructor.
+ AssumptionCache *AC; // Not set by the constructor.
+
+ // Used to check if the second loop needs processing after
+ // RewriteLoopBodyWithConditionConstant rewrites the first loop.
+ std::vector<Loop*> LoopProcessWorklist;
+
+ LUAnalysisCache BranchesInfo; // Per-loop size and quota cache; released in releaseMemory().
+
+ bool EnabledPGO;
+
+ // BFI and ColdEntryFreq are only used when PGO and
+ // LoopUnswitchWithBlockFrequency are enabled.
+ BlockFrequencyInfo BFI;
+ BlockFrequency ColdEntryFreq;
+
+ bool OptimizeForSize; // Constructor argument Os.
+ bool redoLoop;
+
+ Loop *currentLoop;
+ DominatorTree *DT;
+ BasicBlock *loopHeader; // Header of currentLoop; refreshed by initLoopData().
+ BasicBlock *loopPreheader; // Preheader of currentLoop; refreshed by initLoopData().
+
+ // LoopBlocks contains all of the basic blocks of the loop, including the
+ // preheader of the loop, the body of the loop, and the exit blocks of the
+ // loop, in that order.
+ std::vector<BasicBlock*> LoopBlocks;
+ // NewBlocks contains the cloned copies of the basic blocks from LoopBlocks.
+ std::vector<BasicBlock*> NewBlocks;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ explicit LoopUnswitch(bool Os = false) :
+ LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
+ currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
+ loopPreheader(nullptr) {
+ initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool processCurrentLoop();
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG. It also requires and preserves
+ /// LCSSA form and the dominator tree.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ private:
+
+ void releaseMemory() override {
+ BranchesInfo.forgetLoop(currentLoop); // Drops the cache entry and returns its quota to MaxSize.
+ }
+
+ void initLoopData() {
+ loopHeader = currentLoop->getHeader();
+ loopPreheader = currentLoop->getLoopPreheader();
+ }
+
+ /// Split all of the edges from inside the loop to their exit blocks.
+ /// Update the appropriate Phi nodes as we do so.
+ void SplitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ bool TryTrivialLoopUnswitch(bool &Changed);
+
+ bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+ TerminatorInst *TI = nullptr);
+ void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock, TerminatorInst *TI);
+ void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+ TerminatorInst *TI);
+
+ void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool isEqual);
+
+ void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ BasicBlock *TrueDest,
+ BasicBlock *FalseDest,
+ Instruction *InsertPt,
+ TerminatorInst *TI);
+
+ void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+ };
+}
+
+// Registers loop L in the cache and makes it the current loop. The first time
+// a loop is seen, its size estimate and unswitching quota are computed and
+// the quota is deducted from MaxSize. Returns false only when a newly
+// registered loop contains code that cannot be duplicated; returns true
+// otherwise.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+                                AssumptionCache *AC) {
+  std::pair<LoopPropsMapIt, bool> Insertion =
+      LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+  LoopProperties &Props = Insertion.first->second;
+  const bool IsNewLoop = Insertion.second;
+
+  if (IsNewLoop) {
+    // Limit the number of instructions to avoid causing significant code
+    // expansion, and the number of basic blocks, to avoid loops with
+    // large numbers of branches which cause loop unswitching to go crazy.
+    // This is a very ad-hoc heuristic.
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+    // FIXME: This is overly conservative because it does not take into
+    // consideration code simplification opportunities and code that can
+    // be shared by the resultant unswitched loops.
+    CodeMetrics Metrics;
+    Loop::block_iterator BlockIt = L->block_begin();
+    const Loop::block_iterator BlockEnd = L->block_end();
+    for (; BlockIt != BlockEnd; ++BlockIt)
+      Metrics.analyzeBasicBlock(*BlockIt, TTI, EphValues);
+
+    // Turn as much of MaxSize as possible into whole unswitching slots for
+    // this loop and take that amount out of the shared budget.
+    const unsigned LoopSize = Metrics.NumInsts;
+    const unsigned Quota = MaxSize / LoopSize;
+    Props.SizeEstimation = LoopSize;
+    Props.CanBeUnswitchedCount = Quota;
+    Props.WasUnswitchedCount = 0;
+    MaxSize -= LoopSize * Quota;
+
+    if (Metrics.notDuplicatable) {
+      DEBUG(dbgs() << "NOT unswitching loop %"
+                   << L->getHeader()->getName() << ", contents cannot be "
+                   << "duplicated!\n");
+      return false;
+    }
+  }
+
+  // Careful: these pointers are only good until a new loop is added.
+  CurrentLoopProperties = &Props;
+  CurLoopInstructions = &Props.UnswitchedVals;
+
+  return true;
+}
+
+// Drops everything cached for loop L and returns its quota (both the unused
+// and the used slots, times the size estimate) to MaxSize. The current-loop
+// pointers are reset whether or not L was cached.
+void LUAnalysisCache::forgetLoop(const Loop *L) {
+  CurrentLoopProperties = nullptr;
+  CurLoopInstructions = nullptr;
+
+  LoopPropsMapIt Found = LoopsProperties.find(L);
+  if (Found == LoopsProperties.end())
+    return;
+
+  const LoopProperties &Props = Found->second;
+  const unsigned Slots = Props.CanBeUnswitchedCount + Props.WasUnswitchedCount;
+  MaxSize += Slots * Props.SizeEstimation;
+  LoopsProperties.erase(Found);
+}
+
+// Records case value V of switch SI as unswitched for the current loop.
+// A switch can be unswitched piecewise, so every unswitched value is tracked
+// to avoid repeating the work in cloned loops.
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
+  UnswitchedValsMap &CurrentVals = *CurLoopInstructions;
+  CurrentVals[SI].insert(V);
+}
+
+// Tells whether case value V of switch SI was already recorded as unswitched
+// for the current loop.
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
+  UnswitchedValsMap &CurrentVals = *CurLoopInstructions;
+  return CurrentVals[SI].count(V) != 0;
+}
+
+// True while the current loop still has unswitching quota left.
+bool LUAnalysisCache::CostAllowsUnswitching() {
+  return CurrentLoopProperties->CanBeUnswitchedCount != 0;
+}
+
+// Sets up the cached properties of NewLoop from those of the current loop:
+// one quota slot of the current loop is consumed, the remainder is split
+// between the two loops, the size estimate is copied, and the record of
+// unswitched switch values is copied over with each switch translated
+// through VMap.
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                                const ValueToValueMapTy &VMap) {
+
+  LoopProperties &ClonedProps = LoopsProperties[NewLoop];
+  LoopProperties &SourceProps = *CurrentLoopProperties;
+  UnswitchedValsMap &SourceVals = SourceProps.UnswitchedVals;
+
+  // Spend one slot on this unswitching, then divide what is left. The source
+  // loop keeps the larger half when the remainder is odd.
+  const unsigned Remaining = --SourceProps.CanBeUnswitchedCount;
+  ++SourceProps.WasUnswitchedCount;
+  const unsigned ClonedShare = Remaining / 2;
+  ClonedProps.WasUnswitchedCount = 0;
+  ClonedProps.CanBeUnswitchedCount = ClonedShare;
+  SourceProps.CanBeUnswitchedCount = Remaining - ClonedShare;
+
+  ClonedProps.SizeEstimation = SourceProps.SizeEstimation;
+
+  // Copy the unswitched-values info: the switches of the new loop inherit the
+  // values that were already unswitched and have redundant successors.
+  for (UnswitchedValsIt It = SourceVals.begin(), End = SourceVals.end();
+       It != End; ++It) {
+    const SwitchInst *SourceSwitch = It->first;
+    Value *Mapped = VMap.lookup(SourceSwitch);
+    const SwitchInst *ClonedSwitch = cast_or_null<SwitchInst>(Mapped);
+    assert(ClonedSwitch &&
+           "All instructions that are in SrcBB must be in VMap.");
+
+    ClonedProps.UnswitchedVals[ClonedSwitch] =
+        SourceProps.UnswitchedVals[SourceSwitch];
+  }
+}
+
+char LoopUnswitch::ID = 0; // Definition of the pass's static ID member.
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // Passes declared as dependencies of LoopUnswitch follow.
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+
+/// Factory for the loop-unswitch pass; Os is forwarded unchanged to the
+/// LoopUnswitch constructor.
+Pass *llvm::createLoopUnswitchPass(bool Os) {
+  Pass *Unswitcher = new LoopUnswitch(Os);
+  return Unswitcher;
+}
+
+/// Look for a loop-invariant value inside Cond, a condition used in loop L.
+/// Returns Cond itself when Loop::makeLoopInvariant succeeds on it, otherwise
+/// the first invariant operand found by searching through 'and'/'or'
+/// operators (left operand before right), or null when there is none.
+/// Changed is forwarded to Loop::makeLoopInvariant.
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+  // Every value visited here counts as one more scanned instruction.
+  ++TotalInsts;
+
+  // Vector conditions can never be unswitched on, and constants should be
+  // folded rather than unswitched on.
+  if (Cond->getType()->isVectorTy() || isa<Constant>(Cond))
+    return nullptr;
+
+  // TODO: Handle: br (VARIANT|INVARIANT).
+
+  // Hoist simple values out.
+  if (L->makeLoopInvariant(Cond, Changed))
+    return Cond;
+
+  // Only 'and' / 'or' are searched for an invariant piece: unswitching on one
+  // side removes the branch in one loop version and simplifies the condition
+  // in the other.
+  BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond);
+  if (!BO)
+    return nullptr;
+  const auto Opcode = BO->getOpcode();
+  if (Opcode != Instruction::And && Opcode != Instruction::Or)
+    return nullptr;
+
+  if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+    return LHS;
+  return FindLIVLoopCondition(BO->getOperand(1), L, Changed);
+}
+
+/// Loop-pass entry point: cache the analyses for L's function, then keep
+/// calling processCurrentLoop() for as long as it requests another round via
+/// redoLoop. Returns true if any round reported a change.
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  currentLoop = L;
+  LPM = &LPM_Ref;
+  Function *F = currentLoop->getHeader()->getParent();
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  EnabledPGO = F->getEntryCount().hasValue();
+
+  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
+    BranchProbabilityInfo BPI(*F, *LI);
+    BFI.calculate(*F, BPI, *LI);
+
+    // Scale the function entry frequency by ColdnessThreshold/100 to get the
+    // minimum frequency; loops whose header is below it are treated as cold.
+    const BranchProbability ColdProb(ColdnessThreshold, 100);
+    ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
+  }
+
+  bool Changed = false;
+  while (true) {
+    assert(currentLoop->isLCSSAForm(*DT));
+    redoLoop = false;
+    if (processCurrentLoop())
+      Changed = true;
+    if (!redoLoop)
+      break;
+  }
+
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  if (Changed)
+    DT->recalculate(*F);
+  return Changed;
+}
+
+/// Try to unswitch currentLoop once.
+///
+/// A trivial unswitch is attempted first. Failing that, every block of the
+/// loop is scanned for a conditional branch, switch or select whose condition
+/// has a loop-invariant part, and the first one that UnswitchIfProfitable()
+/// accepts is unswitched non-trivially.
+///
+/// Returns true if the IR was modified, either by unswitching or merely
+/// because FindLIVLoopCondition() hoisted something and set Changed.
+bool LoopUnswitch::processCurrentLoop() {
+  bool Changed = false;
+
+  initLoopData();
+
+  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+  if (!loopPreheader)
+    return false;
+
+  // Loops with indirectbr cannot be cloned.
+  if (!currentLoop->isSafeToClone())
+    return false;
+
+  // Without dedicated exits, splitting the exit edge may fail.
+  if (!currentLoop->hasDedicatedExits())
+    return false;
+
+  LLVMContext &Context = loopHeader->getContext();
+
+  // Analyze loop cost, and stop unswitching if loop content can not be
+  // duplicated.
+  if (!BranchesInfo.countLoop(
+          currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                           *currentLoop->getHeader()->getParent()),
+          AC))
+    return false;
+
+  // Try trivial unswitch first before loop over other basic blocks in the loop.
+  if (TryTrivialLoopUnswitch(Changed)) {
+    return true;
+  }
+
+  // From here on Changed may already be true: TryTrivialLoopUnswitch() passes
+  // it to FindLIVLoopCondition(), which can hoist values even when no
+  // unswitch happens. Every bail-out below therefore returns Changed, not
+  // false, so the caller is told about the modification.
+
+  // Do not unswitch loops containing convergent operations, as we might be
+  // making them control dependent on the unswitch value when they were not
+  // before.
+  // FIXME: This could be refined to only bail if the convergent operation is
+  // not already control-dependent on the unswitch value.
+  for (const auto BB : currentLoop->blocks()) {
+    for (auto &I : *BB) {
+      auto CS = CallSite(&I);
+      if (!CS) continue;
+      if (CS.hasFnAttr(Attribute::Convergent))
+        return Changed;
+    }
+  }
+
+  // Do not do non-trivial unswitch while optimizing for size.
+  // FIXME: Use Function::optForSize().
+  if (OptimizeForSize ||
+      loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+    return Changed;
+
+  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
+    // Compute the weighted frequency of the hottest block in the
+    // loop (loopHeader in this case since inner loops should be
+    // processed before outer loop). If it is less than ColdFrequency,
+    // we should not unswitch.
+    BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
+    if (LoopEntryFreq < ColdEntryFreq)
+      return Changed;
+  }
+
+  // Loop over all of the basic blocks in the loop. If we find an interior
+  // block that is branching on a loop-invariant condition, we can unswitch this
+  // loop.
+  for (Loop::block_iterator I = currentLoop->block_begin(),
+         E = currentLoop->block_end(); I != E; ++I) {
+    TerminatorInst *TI = (*I)->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      // If this isn't branching on an invariant condition, we can't unswitch
+      // it.
+      if (BI->isConditional()) {
+        // See if this, or some part of it, is loop invariant. If so, we can
+        // unswitch on it if we desire.
+        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+                                               currentLoop, Changed);
+        if (LoopCond &&
+            UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+          ++NumBranches;
+          return true;
+        }
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                             currentLoop, Changed);
+      unsigned NumCases = SI->getNumCases();
+      if (LoopCond && NumCases) {
+        // Find a value to unswitch on:
+        // FIXME: this should choose the most expensive case!
+        // FIXME: scan for a case with a non-critical edge?
+        Constant *UnswitchVal = nullptr;
+
+        // Do not process same value again and again.
+        // At this point we have some cases already unswitched and
+        // some not yet unswitched. Let's find the first not yet unswitched one.
+        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+             i != e; ++i) {
+          Constant *UnswitchValCandidate = i.getCaseValue();
+          if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+            UnswitchVal = UnswitchValCandidate;
+            break;
+          }
+        }
+
+        // Every case was already unswitched; move on to the next block. Note
+        // that this also skips the select scan below for this block.
+        if (!UnswitchVal)
+          continue;
+
+        if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+          ++NumSwitches;
+          return true;
+        }
+      }
+    }
+
+    // Scan the instructions to check for unswitchable values.
+    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+         BBI != E; ++BBI)
+      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond,
+                                             ConstantInt::getTrue(Context))) {
+          ++NumSelects;
+          return true;
+        }
+      }
+  }
+  return Changed;
+}
+
+/// Recursive worker for isTrivialLoopExitBlock().
+///
+/// Walks forward from BB and returns true only if no block is reached twice,
+/// every path leaves loop L through one and the same block, and none of the
+/// in-loop blocks on the way contains an instruction that may have side
+/// effects. On success ExitBB is the block the loop is exited through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+                                         BasicBlock *&ExitBB,
+                                         std::set<BasicBlock*> &Visited) {
+  // Reaching a block a second time could indicate an infinite loop; without
+  // more analysis we give up.
+  const bool FirstVisit = Visited.insert(BB).second;
+  if (!FirstVisit)
+    return false;
+
+  // A block outside the loop is an exit; accept it only if it is the first
+  // exit found.
+  if (!L->contains(BB)) {
+    if (ExitBB)
+      return false;
+    ExitBB = BB;
+    return true;
+  }
+
+  // An unvisited in-loop block: every successor must itself qualify.
+  succ_iterator Succ = succ_begin(BB);
+  succ_iterator SuccEnd = succ_end(BB);
+  while (Succ != SuccEnd) {
+    if (!isTrivialLoopExitBlockHelper(L, *Succ, ExitBB, Visited))
+      return false;
+    ++Succ;
+  }
+
+  // Everything downstream is fine; finally make sure this block itself is
+  // free of side effects.
+  for (Instruction &Inst : *BB)
+    if (Inst.mayHaveSideEffects())
+      return false;
+
+  return true;
+}
+
+/// If BB unconditionally leads out of loop L through a single exit block
+/// without executing anything that may have side effects, return that exit
+/// block; otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+  BasicBlock *ExitBB = nullptr;
+  std::set<BasicBlock*> Visited;
+  // Pre-seeding the header makes any branch back to it fail the search:
+  // branches to the header make infinite loops.
+  Visited.insert(L->getHeader());
+  const bool Trivial = isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited);
+  return Trivial ? ExitBB : nullptr;
+}
+
+/// currentLoop could be simplified by unswitching on LoopCond == Val. If the
+/// remaining quota allows it, perform the non-trivial unswitch and return
+/// true; otherwise log the refusal and return false. TI is passed through to
+/// UnswitchNontrivialCondition().
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+                                        TerminatorInst *TI) {
+  if (BranchesInfo.CostAllowsUnswitching()) {
+    UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
+    return true;
+  }
+
+  // Not profitable: the unswitching budget for this loop is exhausted.
+  DEBUG(dbgs() << "NOT unswitching loop %"
+               << currentLoop->getHeader()->getName()
+               << " at non-trivial condition '" << *Val
+               << "' == " << *LoopCond << "\n"
+               << ". Cost too high.\n");
+  return false;
+}
+
+/// Create a copy of loop L (and, recursively, of all its subloops) as a child
+/// of PL. VM maps each original block to its clone; the returned loop is
+/// populated with the clones of the blocks that belong directly to L.
+static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+                       LoopInfo *LI, LPPassManager *LPM) {
+  Loop &Copy = LPM->addLoop(PL);
+
+  // Only blocks whose innermost loop is L are added here; blocks of subloops
+  // are added by the recursive calls below.
+  for (BasicBlock *BB : L->blocks()) {
+    if (LI->getLoopFor(BB) != L)
+      continue;
+    Copy.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), *LI);
+  }
+
+  // Clone every subloop underneath the new loop.
+  for (Loop *SubLoop : *L)
+    CloneLoop(SubLoop, &Copy, VM, LI, LPM);
+
+  return &Copy;
+}
+
+/// Copy selected metadata from SrcInst onto DstInst.
+///
+/// Only the MD_prof, MD_make_implicit and MD_dbg kinds are copied; all other
+/// kinds are skipped. Nothing happens when SrcInst is null or carries no
+/// metadata. When Swapped is true and the MD_prof node is a three-operand
+/// "branch_weights" node, the two weights are exchanged in the copy.
+static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst,
+                         bool Swapped) {
+  if (!SrcInst || !SrcInst->hasMetadata())
+    return;
+
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  SrcInst->getAllMetadata(MDs);
+  for (auto &MD : MDs) {
+    switch (MD.first) {
+    default:
+      break;
+    case LLVMContext::MD_prof:
+      if (Swapped && MD.second->getNumOperands() == 3 &&
+          isa<MDString>(MD.second->getOperand(0))) {
+        MDString *MDName = cast<MDString>(MD.second->getOperand(0));
+        if (MDName->getString() == "branch_weights") {
+          // Validate both weight operands *before* dereferencing them; the
+          // previous code called getValue() on a possibly-null pointer and
+          // only asserted afterwards.
+          auto *MDT =
+              dyn_cast_or_null<ConstantAsMetadata>(MD.second->getOperand(1));
+          auto *MDF =
+              dyn_cast_or_null<ConstantAsMetadata>(MD.second->getOperand(2));
+          auto *ValT = MDT ? dyn_cast<ConstantInt>(MDT->getValue()) : nullptr;
+          auto *ValF = MDF ? dyn_cast<ConstantInt>(MDF->getValue()) : nullptr;
+          assert(ValT && ValF && "Invalid Operands of branch_weights");
+          // In builds without assertions a malformed node is copied as-is
+          // instead of crashing.
+          if (ValT && ValF) {
+            auto NewMD =
+                MDBuilder(DstInst->getParent()->getContext())
+                    .createBranchWeights(ValF->getZExtValue(),
+                                         ValT->getZExtValue());
+            MD.second = NewMD;
+          }
+        }
+      }
+      // fallthrough.
+    case LLVMContext::MD_make_implicit:
+    case LLVMContext::MD_dbg:
+      DstInst->setMetadata(MD.first, MD.second);
+    }
+  }
+}
+
+/// Insert, immediately before InsertPt, a conditional branch that goes to
+/// TrueDest when LIC == Val and to FalseDest otherwise. Metadata is copied
+/// from TI (if any) onto the new branch, and critical edges leaving the new
+/// branch are split.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                                  BasicBlock *TrueDest,
+                                                  BasicBlock *FalseDest,
+                                                  Instruction *InsertPt,
+                                                  TerminatorInst *TI) {
+  // An i1 constant can be tested by branching on LIC directly; anything else
+  // needs an explicit equality compare.
+  const bool ValIsBool = isa<ConstantInt>(Val) &&
+                         Val->getType() == Type::getInt1Ty(LIC->getContext());
+
+  Value *BranchVal = LIC;
+  bool Swapped = false;
+  if (!ValIsBool) {
+    BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val);
+  } else if (Val != ConstantInt::getTrue(Val->getContext())) {
+    // Val is 'false': branch on LIC itself with the destinations exchanged.
+    std::swap(TrueDest, FalseDest);
+    Swapped = true;
+  }
+
+  // Insert the new branch.
+  BranchInst *NewBr =
+      BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
+  copyMetadata(NewBr, TI, Swapped);
+
+  // If either edge is critical, split it. This helps preserve LoopSimplify
+  // form for enclosing loops.
+  auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+  for (unsigned Succ = 0; Succ != 2; ++Succ)
+    SplitCriticalEdge(NewBr, Succ, Options);
+}
+
+/// Trivially unswitch loop L on Cond: the caller found that when Cond == Val
+/// the loop only leaves through ExitBlock without side effects, so a branch on
+/// Cond is emitted in the preheader that jumps straight to the split-off part
+/// of ExitBlock. No code is duplicated. TI, if non-null, supplies metadata for
+/// the new branch; redoLoop is set so the loop is processed again.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock,
+ TerminatorInst *TI) {
+ DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName() << " on cond: " << *Val
+ << " == " << *Cond << "\n");
+
+ // First step, split the preheader, so that we know that there is a safe place
+ // to insert the conditional branch. We will change loopPreheader to have a
+ // conditional branch on Cond.
+ BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we should
+ // short-circuit to.
+
+ // Split this block now, so that the loop maintains its exit block, and so
+ // that the jump from the preheader can execute the contents of the exit block
+ // without actually branching to it (the exit block should be dominated by the
+ // loop header, not the preheader).
+ assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+ BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI);
+
+ // Okay, now we have a position to branch from and a position to branch to,
+ // insert the new conditional branch before the preheader's old terminator.
+ EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
+ loopPreheader->getTerminator(), TI);
+ LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+ loopPreheader->getTerminator()->eraseFromParent(); // Remove the old terminator.
+
+ // We need to reprocess this loop, it could be unswitched again.
+ redoLoop = true;
+
+ // Now that we know that the loop is never entered when this condition is a
+ // particular value, rewrite the loop with this info. We know that this will
+ // at least eliminate the old branch.
+ RewriteLoopBodyWithConditionConstant(L, Cond, Val, false); // IsEqual=false: in L, Cond != Val.
+ ++NumTrivial; // Count this trivial unswitch.
+}
+
+/// Starting at the loop header and following unconditional or constant-folded
+/// branches, find the first real branch or switch and check whether it is a
+/// trivial unswitch condition: a fully loop-invariant condition for which one
+/// successor leaves the loop through a single exit block with no side effects.
+/// If so, unswitch it (no code is duplicated) and return true; else return
+/// false. Changed is forwarded to FindLIVLoopCondition, which may hoist values.
+bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
+ BasicBlock *CurrentBB = currentLoop->getHeader();
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+ LLVMContext &Context = CurrentBB->getContext();
+
+ // If loop header has only one reachable successor (currently via an
+ // unconditional branch or constant foldable conditional branch, but
+ // should also consider adding constant foldable switch instruction in
+ // future), we should keep looking for trivial condition candidates in
+ // the successor as well. An alternative is to constant fold conditions
+ // and merge successors into loop header (then we only need to check header's
+ // terminator). The reason for not doing this in LoopUnswitch pass is that
+ // it could potentially break LoopPassManager's invariants. Folding dead
+ // branches could either eliminate the current loop or make other loops
+ // unreachable. LCSSA form might also not be preserved after deleting
+ // branches. The following code keeps traversing loop header's successors
+ // until it finds the trivial condition candidate (condition that is not a
+ // constant). Since unswitching generates branches with constant conditions,
+ // this scenario could be very common in practice.
+ SmallSet<BasicBlock*, 8> Visited;
+
+ while (true) {
+ // If we exit loop or reach a previous visited block, then
+ // we can not reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch
+ // can happen. Exit and return false.
+ if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+ return false;
+
+ // Check if this loop will execute any side-effecting instructions (e.g.
+ // stores, calls, volatile loads) in the part of the loop that the code
+ // *would* execute. Every block on the walked path is checked, header first.
+ for (Instruction &I : *CurrentBB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ // FIXME: add check for constant foldable switch instructions.
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ if (BI->isUnconditional()) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
+ CurrentBB = BI->getSuccessor(1);
+ } else {
+ // Found a trivial condition candidate: non-foldable conditional branch.
+ break;
+ }
+ } else {
+ break; // Any non-branch terminator (e.g. a switch) ends the walk.
+ }
+
+ CurrentTerm = CurrentBB->getTerminator();
+ }
+
+ // CondVal is the value of the condition for which the loop is trivially left.
+ // LoopExitBB is the block the loop exits to when the condition has that value.
+ Constant *CondVal = nullptr;
+ BasicBlock *LoopExitBB = nullptr;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional())
+ return false;
+
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+ currentLoop, Changed);
+
+ // Unswitch only if the trivial condition itself is an LIV (not
+ // partial LIV which could occur in and/or)
+ if (!LoopCond || LoopCond != BI->getCondition())
+ return false;
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(0)))) {
+ CondVal = ConstantInt::getTrue(Context);
+ } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+ BI->getSuccessor(1)))) {
+ CondVal = ConstantInt::getFalse(Context);
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ CurrentTerm);
+ ++NumBranches;
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+ currentLoop, Changed);
+
+ // Unswitch only if the trivial condition itself is an LIV (not
+ // partial LIV which could occur in and/or)
+ if (!LoopCond || LoopCond != SI->getCondition())
+ return false;
+
+ // Check to see if a successor of the switch is guaranteed to go to the
+ // latch block or exit through a single exit block without having any
+ // side-effects. If so, determine the value of Cond that causes it to do
+ // this.
+ // Note that we can't trivially unswitch on the default case or
+ // on already unswitched cases.
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *LoopExitCandidate;
+ if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
+ i.getCaseSuccessor()))) {
+ // Okay, we found a trivial case, remember the value that is trivial.
+ ConstantInt *CaseVal = i.getCaseValue();
+
+ // Check that it was not unswitched before, since already unswitched
+ // trivial values look trivial too.
+ if (BranchesInfo.isUnswitched(SI, CaseVal))
+ continue;
+ LoopExitBB = LoopExitCandidate;
+ CondVal = CaseVal;
+ break;
+ }
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ nullptr);
+ ++NumSwitches;
+ return true;
+ }
+ return false;
+}
+
+/// For every block in ExitBlocks, split off all of its predecessor edges into
+/// a new ".us-lcssa" block, keeping PHI nodes and LCSSA form up to date.
+void LoopUnswitch::SplitExitEdges(Loop *L,
+                                  const SmallVectorImpl<BasicBlock *> &ExitBlocks){
+
+  for (BasicBlock *Exit : ExitBlocks) {
+    // Snapshot the predecessors first; splitting rewrites the edges.
+    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+
+    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+    // general, if we call it on all predecessors of all exits then it does.
+    SplitBlockPredecessors(Exit, ExitPreds, ".us-lcssa", DT, LI,
+                           /*PreserveLCSSA*/ true);
+  }
+}
+
+/// Non-trivially unswitch loop L on LIC == Val: clone the loop, branch between
+/// the original and the clone in the preheader, then simplify both versions.
+/// The clone is queued on LoopProcessWorklist; TI supplies branch metadata.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+ Loop *L, TerminatorInst *TI) {
+ Function *F = loopHeader->getParent();
+ DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << loopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName()
+ << " when '" << *Val << "' == " << *LIC << "\n");
+
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetLoop(L);
+
+ LoopBlocks.clear();
+ NewBlocks.clear();
+
+ // First step, split the preheader and exit blocks, and add these blocks to
+ // the LoopBlocks list.
+ BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI);
+ LoopBlocks.push_back(NewPreheader);
+
+ // We want the loop to come after the preheader, but before the exit blocks.
+ LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Split all of the edges from inside the loop to their exit blocks. Update
+ // the appropriate Phi nodes as we do so.
+ SplitExitEdges(L, ExitBlocks);
+
+ // The exit blocks may have been changed due to edge splitting, recompute.
+ ExitBlocks.clear();
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Add exit blocks to the loop blocks.
+ LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+ // Next step, clone all of the basic blocks that make up the loop (including
+ // the loop preheader and exit blocks), keeping track of the mapping between
+ // the instructions and blocks.
+ NewBlocks.reserve(LoopBlocks.size());
+ ValueToValueMapTy VMap;
+ for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+ BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+
+ NewBlocks.push_back(NewBB);
+ VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping.
+ LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+ }
+
+ // Splice the newly inserted blocks into the function right before the
+ // original preheader.
+ F->getBasicBlockList().splice(NewPreheader->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+
+ // FIXME: We could register any cloned assumptions instead of clearing the
+ // whole function's cache.
+ AC->clear();
+
+ // Now we create the new Loop object for the versioned loop.
+ Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+ // Recalculate unswitching quota, inherit simplified switches info for NewBB,
+ // Probably clone more loop-unswitch related loop properties.
+ BranchesInfo.cloneData(NewLoop, L, VMap);
+
+ Loop *ParentLoop = L->getParentLoop();
+ if (ParentLoop) {
+ // Make sure to add the cloned preheader to the parent loop as well; the
+ // cloned exit blocks are added to their loops in the loop below.
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+ }
+
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+ // The new exit block should be in the same loop as the old one.
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+ ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+ assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+ // If the successor of the exit block had PHI nodes, add an entry for
+ // NewExit, using the cloned value when the incoming value was cloned.
+ for (BasicBlock::iterator I = ExitSucc->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+ ValueToValueMapTy::iterator It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ PN->addIncoming(V, NewExit);
+ }
+
+ if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { // Cloned exit is a landing pad: merge the pads with a PHI.
+ PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+ &*ExitSucc->getFirstInsertionPt());
+
+ for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ LandingPadInst *LPI = BB->getLandingPadInst(); // NOTE(review): assumed non-null for every predecessor — confirm.
+ LPI->replaceAllUsesWith(PN);
+ PN->addIncoming(LPI, BB);
+ }
+ }
+ }
+
+ // Rewrite the cloned code to refer to the cloned values instead of the originals.
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+ for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+ E = NewBlocks[i]->end(); I != E; ++I)
+ RemapInstruction(&*I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+ // Rewrite the original preheader to select between versions of the loop.
+ BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+ "Preheader splitting did not work correctly!");
+
+ // Emit the new branch that selects between the two versions of this loop.
+ EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+ TI);
+ LPM->deleteSimpleAnalysisValue(OldBR, L);
+ OldBR->eraseFromParent();
+
+ LoopProcessWorklist.push_back(NewLoop);
+ redoLoop = true;
+
+ // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody
+ // deletes the instruction (for example by simplifying a PHI that feeds into
+ // the condition that we're unswitching on), we don't rewrite the second
+ // iteration.
+ WeakVH LICHandle(LIC);
+
+ // Now we rewrite the original loop knowing that LIC != Val (IsEqual=false)
+ // and, below, the cloned loop knowing that LIC == Val (IsEqual=true).
+ RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+
+ // It's possible that simplifying one loop could cause the other to be
+ // changed to another value or a constant. If it's a constant, don't simplify
+ // it.
+ if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
+ LICHandle && !isa<Constant>(LICHandle))
+ RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);
+}
+
+/// Erase every occurrence of I from Worklist, preserving the order of the
+/// remaining entries.
+static void RemoveFromWorklist(Instruction *I,
+                               std::vector<Instruction*> &Worklist) {
+  // Erase-remove: compact the survivors, then chop off the tail.
+  auto NewEnd = std::remove(Worklist.begin(), Worklist.end(), I);
+  Worklist.erase(NewEnd, Worklist.end());
+}
+
+/// I has been found to be equivalent to V: replace all uses of I with V and
+/// erase I. The instruction operands and the users of I are queued on
+/// Worklist first, and I itself is removed from it.
+static void ReplaceUsesOfWith(Instruction *I, Value *V,
+                              std::vector<Instruction*> &Worklist,
+                              Loop *L, LPPassManager *LPM) {
+  DEBUG(dbgs() << "Replace with '" << *V << "': " << *I);
+
+  // Operands of I may become dead once I is gone, so revisit them.
+  const unsigned NumOps = I->getNumOperands();
+  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
+    Instruction *OpInst = dyn_cast<Instruction>(I->getOperand(OpIdx));
+    if (OpInst)
+      Worklist.push_back(OpInst);
+  }
+
+  // Users of I may be simplifiable once they see V instead.
+  for (User *U : I->users()) {
+    Instruction *UserInst = cast<Instruction>(U);
+    Worklist.push_back(UserInst);
+  }
+
+  LPM->deleteSimpleAnalysisValue(I, L);
+  RemoveFromWorklist(I, Worklist);
+  I->replaceAllUsesWith(V);
+  I->eraseFromParent();
+  ++NumSimplify;
+}
+
+/// Simplify loop L using what unswitching established about LIC: if IsEqual is
+/// true, LIC == Val inside L; otherwise LIC != Val. Uses of LIC in L are
+/// replaced directly or, for switches, the dead case is made unreachable.
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val,
+ bool IsEqual) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // FIXME: Support correlated properties, like:
+ // for (...)
+ // if (li1 < li2)
+ // ...
+ // if (li1 > li2)
+ // ...
+
+ // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
+ // selects, switches.
+ std::vector<Instruction*> Worklist;
+ LLVMContext &Context = Val->getContext();
+
+ // If we know that LIC == Val, or (for an i1 Val) that LIC == !Val, just
+ // replace uses of LIC in the loop with the appropriate constant directly.
+ if (IsEqual || (isa<ConstantInt>(Val) &&
+ Val->getType()->isIntegerTy(1))) {
+ Value *Replacement;
+ if (IsEqual)
+ Replacement = Val;
+ else
+ Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+ !cast<ConstantInt>(Val)->getZExtValue());
+
+ for (User *U : LIC->users()) { // Collect in-loop users before rewriting them.
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+ Worklist.push_back(UI);
+ }
+
+ for (std::vector<Instruction*>::iterator UI = Worklist.begin(),
+ UE = Worklist.end(); UI != UE; ++UI)
+ (*UI)->replaceUsesOfWith(LIC, Replacement);
+
+ SimplifyCode(Worklist, L);
+ return;
+ }
+
+ // Otherwise, we don't know the precise value of LIC, but we do know that it
+ // is certainly NOT "Val". As such, simplify any uses in the loop that we
+ // can. This case occurs when we unswitch switch statements.
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+
+ Worklist.push_back(UI);
+
+ // TODO: We could do other simplifications, for example, turning
+ // 'icmp eq LIC, Val' -> false.
+
+ // If we know that LIC is not Val, use this info to simplify code.
+ SwitchInst *SI = dyn_cast<SwitchInst>(UI);
+ if (!SI || !isa<ConstantInt>(Val)) continue;
+
+ SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+ // Default case is live for multiple values.
+ if (DeadCase == SI->case_default()) continue;
+
+ // Found a dead case value. Don't remove PHI nodes in the
+ // successor if they become single-entry, those PHI nodes may
+ // be in the Users list.
+
+ BasicBlock *Switch = SI->getParent();
+ BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *Latch = L->getLoopLatch();
+
+ BranchesInfo.setUnswitched(SI, Val); // Remember Val so it is not unswitched again.
+
+ if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
+ // If the DeadCase successor dominates the loop latch, then the
+ // transformation isn't safe since it will delete the sole predecessor edge
+ // to the latch.
+ if (Latch && DT->dominates(SISucc, Latch))
+ continue;
+
+ // FIXME: This is a hack. We need to keep the successor around
+ // and hooked up so as to preserve the loop structure, because
+ // trying to update it is complicated. So instead we preserve the
+ // loop structure and put the block on a dead code path.
+ SplitEdge(Switch, SISucc, DT, LI);
+ // Compute the successors instead of relying on the return value
+ // of SplitEdge, since it may have split the switch successor
+ // after PHI nodes.
+ BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+ // Create an "unreachable" destination.
+ BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+ Switch->getParent(),
+ OldSISucc);
+ new UnreachableInst(Context, Abort);
+ // Force the new case destination to branch to the "unreachable"
+ // block while maintaining a (dead) CFG edge to the old block.
+ NewSISucc->getTerminator()->eraseFromParent();
+ BranchInst::Create(Abort, OldSISucc,
+ ConstantInt::getTrue(Context), NewSISucc);
+ // Release the PHI operands for this edge by replacing them with undef.
+ for (BasicBlock::iterator II = NewSISucc->begin();
+ PHINode *PN = dyn_cast<PHINode>(II); ++II)
+ PN->setIncomingValue(PN->getBasicBlockIndex(Switch),
+ UndefValue::get(PN->getType()));
+ // Tell the domtree about the new block. We don't fully update the
+ // domtree here -- instead we force it to do a full recomputation
+ // after the pass is complete -- but we do need to inform it of
+ // new blocks.
+ DT->addNewBlock(Abort, NewSISucc);
+ }
+
+ SimplifyCode(Worklist, L);
+}
+
+ /// Now that unswitching has rewritten some instructions in the loop, drain
+ /// \p Worklist and clean up after it: erase trivially dead instructions, replace
+ /// instructions that SimplifyInstruction can fold, and merge a block ending in
+ /// an unconditional branch into its successor when it is that successor's only
+ /// predecessor. This is effectively a very simple loop-structure-aware
+ /// optimizer.
+ ///
+ /// The loop \p L may be deleted while this runs, so apart from fetching the
+ /// DataLayout up front it is only forwarded to the analysis-update helpers.
+ ///
+ /// FIXME: When the loop optimizer is more mature, separate this out to a new
+ /// pass.
+ ///
+ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+   while (!Worklist.empty()) {
+     Instruction *I = Worklist.back();
+     Worklist.pop_back();
+
+     // Simple DCE.
+     if (isInstructionTriviallyDead(I)) {
+       // Close the quote and terminate the line so that consecutive debug
+       // records do not run together.
+       DEBUG(dbgs() << "Remove dead instruction '" << *I << "'\n");
+
+       // Queue the operands of I: they may be dead once I is gone.
+       for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+         if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+           Worklist.push_back(Use);
+       LPM->deleteSimpleAnalysisValue(I, L);
+       RemoveFromWorklist(I, Worklist);
+       I->eraseFromParent();
+       ++NumSimplify;
+       continue;
+     }
+
+     // See if instruction simplification can hack this up. This is common for
+     // things like "select false, X, Y" after unswitching made the condition be
+     // 'false'. TODO: update the domtree properly so we can pass it here.
+     if (Value *V = SimplifyInstruction(I, DL))
+       if (LI->replacementPreservesLCSSAForm(I, V)) {
+         ReplaceUsesOfWith(I, V, Worklist, L, LPM);
+         continue;
+       }
+
+     // Special case hacks that appear commonly in unswitched code.
+     if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+       if (BI->isUnconditional()) {
+         // If BI's parent is the only pred of the successor, fold the two blocks
+         // together.
+         BasicBlock *Pred = BI->getParent();
+         BasicBlock *Succ = BI->getSuccessor(0);
+         BasicBlock *SinglePred = Succ->getSinglePredecessor();
+         if (!SinglePred) continue; // Nothing to do.
+         assert(SinglePred == Pred && "CFG broken");
+
+         DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+                      << Succ->getName() << "\n");
+
+         // Resolve any single entry PHI nodes in Succ.
+         while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+           ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+
+         // If Succ has any successors with PHI nodes, update them to have
+         // entries coming from Pred instead of Succ.
+         Succ->replaceAllUsesWith(Pred);
+
+         // Move all of the successor contents from Succ to Pred.
+         Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
+                                    Succ->begin(), Succ->end());
+         LPM->deleteSimpleAnalysisValue(BI, L);
+         BI->eraseFromParent();
+         RemoveFromWorklist(BI, Worklist);
+
+         // Remove Succ from the loop tree.
+         LI->removeBlock(Succ);
+         LPM->deleteSimpleAnalysisValue(Succ, L);
+         Succ->eraseFromParent();
+         ++NumSimplify;
+         continue;
+       }
+     }
+   }
+ }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
new file mode 100644
index 0000000..41511bc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -0,0 +1,148 @@
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+ /// Expand a cmpxchg into a plain load / compare / select / store sequence and
+ /// rebuild its { loaded value, success flag } result. The cmpxchg instruction
+ /// itself is erased. Always returns true.
+ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+   IRBuilder<> B(CXI);
+   Value *Addr = CXI->getPointerOperand();
+   Value *Expected = CXI->getCompareOperand();
+   Value *Desired = CXI->getNewValOperand();
+
+   // Store the new value only when the loaded value matched; otherwise write
+   // the loaded value straight back.
+   LoadInst *Loaded = B.CreateLoad(Addr);
+   Value *Matched = B.CreateICmpEQ(Loaded, Expected);
+   Value *ToStore = B.CreateSelect(Matched, Desired, Loaded);
+   B.CreateStore(ToStore, Addr);
+
+   // Field 0 is the loaded value, field 1 the comparison result.
+   Value *Pair = UndefValue::get(CXI->getType());
+   Pair = B.CreateInsertValue(Pair, Loaded, 0);
+   Pair = B.CreateInsertValue(Pair, Matched, 1);
+
+   CXI->replaceAllUsesWith(Pair);
+   CXI->eraseFromParent();
+   return true;
+ }
+
+ /// Expand an atomicrmw into a plain load, the arithmetic for its operation and
+ /// a plain store. Users of the atomicrmw are redirected to the loaded (old)
+ /// value and the instruction is erased. Always returns true.
+ static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+   IRBuilder<> B(RMWI);
+   Value *Addr = RMWI->getPointerOperand();
+   Value *Operand = RMWI->getValOperand();
+
+   LoadInst *Old = B.CreateLoad(Addr);
+   Value *NewVal = nullptr;
+
+   switch (RMWI->getOperation()) {
+   case AtomicRMWInst::Xchg:
+     NewVal = Operand;
+     break;
+   case AtomicRMWInst::Add:
+     NewVal = B.CreateAdd(Old, Operand);
+     break;
+   case AtomicRMWInst::Sub:
+     NewVal = B.CreateSub(Old, Operand);
+     break;
+   case AtomicRMWInst::And:
+     NewVal = B.CreateAnd(Old, Operand);
+     break;
+   case AtomicRMWInst::Nand: {
+     Value *Anded = B.CreateAnd(Old, Operand);
+     NewVal = B.CreateNot(Anded);
+     break;
+   }
+   case AtomicRMWInst::Or:
+     NewVal = B.CreateOr(Old, Operand);
+     break;
+   case AtomicRMWInst::Xor:
+     NewVal = B.CreateXor(Old, Operand);
+     break;
+   // The four min/max flavours share one shape: compare old < operand (signed
+   // or unsigned) and pick the larger or the smaller side.
+   case AtomicRMWInst::Max: {
+     Value *OldIsLess = B.CreateICmpSLT(Old, Operand);
+     NewVal = B.CreateSelect(OldIsLess, Operand, Old);
+     break;
+   }
+   case AtomicRMWInst::Min: {
+     Value *OldIsLess = B.CreateICmpSLT(Old, Operand);
+     NewVal = B.CreateSelect(OldIsLess, Old, Operand);
+     break;
+   }
+   case AtomicRMWInst::UMax: {
+     Value *OldIsLess = B.CreateICmpULT(Old, Operand);
+     NewVal = B.CreateSelect(OldIsLess, Operand, Old);
+     break;
+   }
+   case AtomicRMWInst::UMin: {
+     Value *OldIsLess = B.CreateICmpULT(Old, Operand);
+     NewVal = B.CreateSelect(OldIsLess, Old, Operand);
+     break;
+   }
+   default:
+     llvm_unreachable("Unexpected RMW operation");
+   }
+
+   B.CreateStore(NewVal, Addr);
+   RMWI->replaceAllUsesWith(Old);
+   RMWI->eraseFromParent();
+   return true;
+ }
+
+ static bool LowerFenceInst(FenceInst *FI) { // Lowers a fence by deleting it outright.
+ FI->eraseFromParent();
+ return true; // The instruction was removed, so always report a change.
+ }
+
+ static bool LowerLoadInst(LoadInst *LI) { // Caller only passes loads for which isAtomic() is true.
+ LI->setAtomic(NotAtomic); // Strip the atomic ordering; the load itself stays.
+ return true;
+ }
+
+ static bool LowerStoreInst(StoreInst *SI) { // Caller only passes stores for which isAtomic() is true.
+ SI->setAtomic(NotAtomic); // Strip the atomic ordering; the store itself stays.
+ return true;
+ }
+
+ namespace {
+   /// Basic-block pass that rewrites every atomic construct in a block into its
+   /// non-atomic equivalent: fences are deleted, cmpxchg and atomicrmw are
+   /// expanded into load/op/store sequences, and atomic loads and stores have
+   /// their atomic ordering removed.
+   struct LowerAtomic : public BasicBlockPass {
+     static char ID;
+     LowerAtomic() : BasicBlockPass(ID) {
+       initializeLowerAtomicPass(*PassRegistry::getPassRegistry());
+     }
+
+     /// Lower all atomic instructions in \p BB. Returns true if the block was
+     /// modified in any way.
+     bool runOnBasicBlock(BasicBlock &BB) override {
+       if (skipOptnoneFunction(BB))
+         return false;
+       bool Changed = false;
+       for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) {
+         // Advance before lowering: several of the helpers erase Inst.
+         Instruction *Inst = &*DI++;
+         if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+           Changed |= LowerFenceInst(FI);
+         else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+           Changed |= LowerAtomicCmpXchgInst(CXI);
+         else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+           Changed |= LowerAtomicRMWInst(RMWI);
+         else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+           // Dropping the ordering mutates the IR, so it must be reported just
+           // like the other lowerings.
+           if (LI->isAtomic())
+             Changed |= LowerLoadInst(LI);
+         } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+           if (SI->isAtomic())
+             Changed |= LowerStoreInst(SI);
+         }
+       }
+       return Changed;
+     }
+   };
+ }
+
+ char LowerAtomic::ID = 0; // Pass identification object handed to BasicBlockPass by the constructor.
+ INITIALIZE_PASS(LowerAtomic, "loweratomic", // "loweratomic" is the pass's registered argument name.
+ "Lower atomic intrinsics to non-atomic form",
+ false, false)
+
+ Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); } // Factory: returns a newly allocated pass instance.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
new file mode 100644
index 0000000..2ace902
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -0,0 +1,192 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+ "Number of 'expect' intrinsic instructions handled");
+
+static cl::opt<uint32_t>
+LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
+ cl::desc("Weight of the branch likely to be taken (default = 64)"));
+static cl::opt<uint32_t>
+UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
+ cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
+
+ /// If the condition of \p SI is a call to llvm.expect with a constant expected
+ /// value, attach branch-weight metadata that marks the matching case (or the
+ /// default destination when no case matches) as likely, and make the switch
+ /// test the intrinsic's first argument directly. Returns true on success.
+ static bool handleSwitchExpect(SwitchInst &SI) {
+   // The switch must be driven directly by a call to the expect intrinsic.
+   CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+   Function *Fn = CI ? CI->getCalledFunction() : nullptr;
+   if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+     return false;
+
+   ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+   if (!ExpectedValue)
+     return false;
+
+   // Weight slot 0 belongs to the default destination; case i lives in slot
+   // i + 1.
+   SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+   const unsigned LikelySlot =
+       (Case == SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
+
+   SmallVector<uint32_t, 16> Weights(SI.getNumCases() + 1, UnlikelyBranchWeight);
+   Weights[LikelySlot] = LikelyBranchWeight;
+
+   MDBuilder MDB(CI->getContext());
+   SI.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+
+   // Bypass the intrinsic: switch on the value it wrapped.
+   SI.setCondition(CI->getArgOperand(0));
+   return true;
+ }
+
+ /// If \p BI is a conditional branch whose condition comes from a call to
+ /// llvm.expect -- either directly, or through an `icmp ne` of the call result
+ /// against a constant -- attach branch-weight metadata to it and rewire the
+ /// condition to use the intrinsic's first argument. Returns true if metadata
+ /// was attached.
+ static bool handleBranchExpect(BranchInst &BI) {
+   if (BI.isUnconditional())
+     return false;
+
+   // Handle non-optimized IR code like:
+   //   %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+   //   %tobool = icmp ne i64 %expval, 0
+   //   br i1 %tobool, label %if.then, label %if.end
+   //
+   // Or the following simpler case:
+   //   %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+   //   br i1 %expval, label %if.then, label %if.end
+
+   CallInst *CI;
+   // Constant the expect result is compared against; stays null when the
+   // branch tests the call result directly.
+   ConstantInt *CmpConstOperand = nullptr;
+
+   ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition());
+   if (!CmpI) {
+     CI = dyn_cast<CallInst>(BI.getCondition());
+   } else {
+     if (CmpI->getPredicate() != CmpInst::ICMP_NE)
+       return false;
+     // Knowing the likely value of the left-hand side says nothing about the
+     // outcome of the comparison unless the right-hand side is a constant.
+     CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+     if (!CmpConstOperand)
+       return false;
+     CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+   }
+
+   if (!CI)
+     return false;
+
+   Function *Fn = CI->getCalledFunction();
+   if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+     return false;
+
+   Value *ArgValue = CI->getArgOperand(0);
+   ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+   if (!ExpectedValue)
+     return false;
+
+   // Decide which successor is the likely one. For the compare form the branch
+   // condition is "value != constant", so successor 0 is likely exactly when
+   // the expected value differs from that constant (both constants have the
+   // intrinsic's integer type). For the direct i1 form successor 0 is likely
+   // when the expected value is 1.
+   bool TrueEdgeLikely;
+   if (CmpI)
+     TrueEdgeLikely = ExpectedValue->getValue() != CmpConstOperand->getValue();
+   else
+     TrueEdgeLikely = ExpectedValue->isOne();
+
+   MDBuilder MDB(CI->getContext());
+   MDNode *Node;
+   if (TrueEdgeLikely)
+     Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+   else
+     Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+   BI.setMetadata(LLVMContext::MD_prof, Node);
+
+   // Bypass the intrinsic so the condition uses the wrapped value directly.
+   if (CmpI)
+     CmpI->setOperand(0, ArgValue);
+   else
+     BI.setCondition(ArgValue);
+   return true;
+ }
+
+ /// Walk every block of \p F: first try to turn an llvm.expect feeding the
+ /// block's branch or switch terminator into branch-weight metadata, then
+ /// replace each llvm.expect call in the block with its first argument and
+ /// erase it. Returns true if any call was removed.
+ static bool lowerExpectIntrinsic(Function &F) {
+   bool Changed = false;
+
+   for (BasicBlock &BB : F) {
+     // Attach branch weights to the terminator where an expect feeds it.
+     auto *Term = BB.getTerminator();
+     if (BranchInst *Br = dyn_cast<BranchInst>(Term)) {
+       if (handleBranchExpect(*Br))
+         ++ExpectIntrinsicsHandled;
+     } else if (SwitchInst *Sw = dyn_cast<SwitchInst>(Term)) {
+       if (handleSwitchExpect(*Sw))
+         ++ExpectIntrinsicsHandled;
+     }
+
+     // Remove llvm.expect intrinsics. The iterator is advanced before the
+     // current instruction can be erased.
+     BasicBlock::iterator It = BB.begin(), End = BB.end();
+     while (It != End) {
+       CallInst *Call = dyn_cast<CallInst>(It++);
+       if (!Call)
+         continue;
+
+       Function *Callee = Call->getCalledFunction();
+       if (!Callee || Callee->getIntrinsicID() != Intrinsic::expect)
+         continue;
+
+       Value *Wrapped = Call->getArgOperand(0);
+       Call->replaceAllUsesWith(Wrapped);
+       Call->eraseFromParent();
+       Changed = true;
+     }
+   }
+
+   return Changed;
+ }
+
+ /// New-pass-manager entry point: lower the expect intrinsics in \p F and
+ /// report that no analyses are preserved if anything changed, all otherwise.
+ PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) {
+   const bool Modified = lowerExpectIntrinsic(F);
+   return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ }
+
+ namespace {
+ /// \brief Legacy pass for lowering expect intrinsics out of the IR.
+ ///
+ /// When this pass is run over a function it uses expect intrinsics which feed
+ /// branches and switches to provide branch weight metadata for those
+ /// terminators. It then removes the expect intrinsics from the IR so the rest
+ /// of the optimizer can ignore them.
+ class LowerExpectIntrinsic : public FunctionPass {
+ public:
+ static char ID; // Pass identification object, passed to FunctionPass below.
+ LowerExpectIntrinsic() : FunctionPass(ID) {
+ initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); } // Thin wrapper over the shared implementation.
+ };
+ }
+
+ char LowerExpectIntrinsic::ID = 0;
+ INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", // "lower-expect" is the pass's registered argument name.
+ "Lower 'expect' Intrinsics", false, false)
+
+ FunctionPass *llvm::createLowerExpectIntrinsicPass() { // Factory: returns a newly allocated legacy pass.
+ return new LowerExpectIntrinsic();
+ }
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 0000000..6b43b0f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,1304 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "memcpyopt"
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+
+ /// Sum the constant offset contributed by operands [Idx, end) of \p GEP.
+ ///
+ /// If any of those operands is not a ConstantInt, \p VariableIdxFound is set
+ /// to true and the returned value (1) is not a real offset; callers must check
+ /// the flag before using the result.
+ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
+                                   bool &VariableIdxFound,
+                                   const DataLayout &DL) {
+   // Advance the type iterator past the indices that are being skipped.
+   gep_type_iterator GTI = gep_type_begin(GEP);
+   unsigned Pos = 1;
+   while (Pos != Idx) {
+     ++Pos;
+     ++GTI;
+   }
+
+   // Accumulate the offset implied by the remaining indices.
+   int64_t Offset = 0;
+   for (const unsigned NumOps = GEP->getNumOperands(); Pos != NumOps;
+        ++Pos, ++GTI) {
+     ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(Pos));
+     if (!OpC) {
+       VariableIdxFound = true;
+       return 1;
+     }
+
+     // A zero index adds nothing.
+     if (OpC->isZero())
+       continue;
+
+     if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+       // Struct index: add the offset of the selected field.
+       Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+     } else {
+       // Sequential type (array, vector, pointer): scale the index by the
+       // allocation size of one element.
+       uint64_t ElemSize = DL.getTypeAllocSize(GTI.getIndexedType());
+       Offset += ElemSize * OpC->getSExtValue();
+     }
+   }
+
+   return Offset;
+ }
+
+ /// Return true if \p Ptr1 and \p Ptr2 provably differ by a constant, and store
+ /// that constant (Ptr2 minus Ptr1) in \p Offset. For example, with Ptr1 = &A[42]
+ /// and Ptr2 = &A[40] over 4-byte elements, Offset is -8. When false is
+ /// returned the contents of \p Offset must not be relied upon.
+ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+                             const DataLayout &DL) {
+   Ptr1 = Ptr1->stripPointerCasts();
+   Ptr2 = Ptr2->stripPointerCasts();
+
+   // Identical pointers are trivially zero apart.
+   if (Ptr1 == Ptr2) {
+     Offset = 0;
+     return true;
+   }
+
+   GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+   GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
+   bool SawVariableIdx = false;
+
+   // Only Ptr1 is a GEP: succeed if it is a constant offset from Ptr2 itself,
+   // as in "gep P, 1" versus "P".
+   if (GEP1 && !GEP2) {
+     if (GEP1->getOperand(0)->stripPointerCasts() != Ptr2)
+       return false;
+     Offset = -GetOffsetFromIndex(GEP1, 1, SawVariableIdx, DL);
+     return !SawVariableIdx;
+   }
+
+   // Only Ptr2 is a GEP: the mirror image of the case above.
+   if (GEP2 && !GEP1) {
+     if (GEP2->getOperand(0)->stripPointerCasts() != Ptr1)
+       return false;
+     Offset = GetOffsetFromIndex(GEP2, 1, SawVariableIdx, DL);
+     return !SawVariableIdx;
+   }
+
+   // What remains is two GEPs (or none). Only GEPs off an identical base are
+   // handled: they may share a prefix of common, possibly variable, indices,
+   // after which each must carry on with constant indices only.
+   if (!GEP1 || !GEP2)
+     return false;
+   if (GEP1->getOperand(0) != GEP2->getOperand(0))
+     return false;
+
+   // Skip the indices both GEPs have in common.
+   unsigned Idx = 1;
+   while (Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands() &&
+          GEP1->getOperand(Idx) == GEP2->getOperand(Idx))
+     ++Idx;
+
+   int64_t Tail1 = GetOffsetFromIndex(GEP1, Idx, SawVariableIdx, DL);
+   int64_t Tail2 = GetOffsetFromIndex(GEP2, Idx, SawVariableIdx, DL);
+   if (SawVariableIdx)
+     return false;
+
+   Offset = Tail2 - Tail1;
+   return true;
+ }
+
+
+ /// Represents a range of memset'd bytes with the ByteVal value.
+ /// This allows us to analyze stores like:
+ /// store 0 -> P+1
+ /// store 0 -> P+0
+ /// store 0 -> P+3
+ /// store 0 -> P+2
+ /// which sometimes happens with stores to arrays of structs etc. With one-byte
+ /// stores, the first makes a range [1, 2). The second store extends the range
+ /// to [0, 2). The third makes a new range [3, 4). The fourth store joins the
+ /// two ranges into [0, 4) which is memset'able.
+ namespace {
+ struct MemsetRange {
+ // Start/End - The half-open span of offsets that this range covers.
+ // The range is closed at the start and open at the end: [Start, End).
+ int64_t Start, End;
+
+ /// StartPtr - The pointer operand of the instruction that writes the start
+ /// of the range.
+ Value *StartPtr;
+
+ /// Alignment - The alignment recorded for the instruction at Start.
+ unsigned Alignment;
+
+ /// TheStores - The store and memset instructions that make up this range.
+ SmallVector<Instruction*, 16> TheStores;
+
+ bool isProfitableToUseMemset(const DataLayout &DL) const; // Heuristic: is replacing TheStores by one memset worthwhile?
+ };
+ } // end anon namespace
+
+ /// Heuristic deciding whether the instructions collected in this range should
+ /// be replaced by a single memset.
+ ///
+ /// \param DL supplies the width of the largest legal integer type, which is
+ ///           used to estimate how many stores a memset of this size would be
+ ///           expanded into.
+ /// \returns true if merging is expected to reduce the number of stores.
+ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+   // With at least 4 stores to merge, or at least 16 bytes covered, use memset.
+   if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+   // If there is nothing to merge, don't do anything.
+   if (TheStores.size() < 2) return false;
+
+   // If any of the stores are a memset, then it is always good to extend the
+   // memset.
+   for (Instruction *SI : TheStores)
+     if (!isa<StoreInst>(SI))
+       return true;
+
+   // Assume that the code generator is capable of merging pairs of stores
+   // together if it wants to.
+   if (TheStores.size() == 2) return false;
+
+   // Only the three-store case reaches this point, and it can still be
+   // worthwhile: merging narrow stores into one wider store is useful almost
+   // always. However, merging 2 32-bit stores isn't useful on a 32-bit
+   // architecture (the memset will be split into 2 32-bit stores anyway) and
+   // doing so can pessimize the llvm optimizer.
+   //
+   // Since we don't have perfect knowledge here, make some assumptions: assume
+   // the maximum GPR width is the same size as the largest legal integer
+   // size. If so, check to see whether we will end up actually reducing the
+   // number of stores used.
+   unsigned Bytes = unsigned(End-Start);
+   // The legal integer width is reported in bits; convert it to bytes so that
+   // it is comparable with Bytes.
+   unsigned MaxIntSize = DL.getLargestLegalIntTypeSize() / 8;
+   if (MaxIntSize == 0)
+     MaxIntSize = 1;
+   unsigned NumPointerStores = Bytes / MaxIntSize;
+
+   // Assume the remaining bytes if any are done a byte at a time.
+   unsigned NumByteStores = Bytes % MaxIntSize;
+
+   // If we will reduce the # stores (according to this heuristic), do the
+   // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+   // etc.
+   return TheStores.size() > NumPointerStores+NumByteStores;
+ }
+
+
+ namespace {
+ /// Accumulates the byte ranges written by a sequence of stores and memsets,
+ /// expressed as offsets from the first store, merging them as they are added.
+ class MemsetRanges {
+   /// The memset ranges, kept as a sorted list.
+   SmallVector<MemsetRange, 8> Ranges;
+   using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
+   const DataLayout &DL;
+
+ public:
+   MemsetRanges(const DataLayout &DL) : DL(DL) {}
+
+   using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
+   const_iterator begin() const { return Ranges.begin(); }
+   const_iterator end() const { return Ranges.end(); }
+   bool empty() const { return Ranges.empty(); }
+
+   /// Record \p Inst, which must be either a StoreInst or a MemSetInst.
+   void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+     StoreInst *Store = dyn_cast<StoreInst>(Inst);
+     if (!Store) {
+       addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+       return;
+     }
+     addStore(OffsetFromFirst, Store);
+   }
+
+   /// Record a store; the range length is the store size of the stored type.
+   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+     Type *StoredTy = SI->getOperand(0)->getType();
+     int64_t StoreSize = DL.getTypeStoreSize(StoredTy);
+
+     addRange(OffsetFromFirst, StoreSize,
+              SI->getPointerOperand(), SI->getAlignment(), SI);
+   }
+
+   /// Record a memset; its length operand must be a ConstantInt.
+   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+     ConstantInt *Length = cast<ConstantInt>(MSI->getLength());
+     int64_t Size = Length->getZExtValue();
+     addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+   }
+
+   void addRange(int64_t Start, int64_t Size, Value *Ptr,
+                 unsigned Alignment, Instruction *Inst);
+
+ };
+
+ } // end anon namespace
+
+
+ /// Record that Inst writes the bytes [Start, Start+Size). This either inserts
+ /// a new range at the right position or grows an existing range, absorbing
+ /// any following ranges that the grown range now reaches.
+ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst) {
+ int64_t End = Start+Size;
+
+ range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start, // first range with End >= Start
+ [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+
+ // We now know that I == E, in which case we didn't find anything to merge
+ // with, or that Start <= I->End. If End < I->Start or I == E, then we need
+ // to insert a new range. Handle this now.
+ if (I == Ranges.end() || End < I->Start) {
+ MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = Ptr;
+ R.Alignment = Alignment;
+ R.TheStores.push_back(Inst);
+ return;
+ }
+
+ // This store overlaps with (or directly abuts) I, add it.
+ I->TheStores.push_back(Inst);
+
+ // At this point, we may have an interval that completely contains our store.
+ // If so, just add it to the interval and return.
+ if (I->Start <= Start && I->End >= End)
+ return;
+
+ // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+ // but is not entirely contained within the range.
+
+ // See if the range extends the start of the range. In this case, it couldn't
+ // possibly cause it to join the prior range, because otherwise we would have
+ // stopped on *it*.
+ if (Start < I->Start) {
+ I->Start = Start;
+ I->StartPtr = Ptr; // the new instruction now defines where the range begins
+ I->Alignment = Alignment;
+ }
+
+ // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+ // is in or right at the end of I), and that End >= I->Start. Extend I out to
+ // End.
+ if (End > I->End) {
+ I->End = End;
+ range_iterator NextI = I;
+ while (++NextI != Ranges.end() && End >= NextI->Start) { // following range is reached by the new End
+ // Merge the range in.
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+ if (NextI->End > I->End)
+ I->End = NextI->End;
+ Ranges.erase(NextI);
+ NextI = I; // restart from I so the loop's ++NextI lands on the new neighbour
+ }
+ }
+ }
+
+//===----------------------------------------------------------------------===//
+// MemCpyOpt Pass
+//===----------------------------------------------------------------------===//
+
+ namespace {
+ class MemCpyOpt : public FunctionPass {
+ MemoryDependenceAnalysis *MD; // Null until set elsewhere; used when erasing merged stores.
+ TargetLibraryInfo *TLI; // Null until set elsewhere.
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ MemCpyOpt() : FunctionPass(ID) {
+ initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
+ MD = nullptr;
+ TLI = nullptr;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ private:
+ // Declares the analyses this pass requires and the ones it preserves.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceAnalysis>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceAnalysis>();
+ }
+
+ // Helper functions (defined out of line)
+ bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+ bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
+ bool processMemCpy(MemCpyInst *M);
+ bool processMemMove(MemMoveInst *M);
+ bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
+ uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
+ bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
+ bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
+ bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
+ bool processByValArgument(CallSite CS, unsigned ArgNo);
+ Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
+ Value *ByteVal);
+
+ bool iterateOnFunction(Function &F);
+ };
+
+ char MemCpyOpt::ID = 0;
+ }
+
+ /// Factory for the pass; returns a newly allocated MemCpyOpt instance.
+ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
+
+ INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", // "memcpyopt" is the registered argument name.
+ false, false)
+ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+ false, false)
+
+ /// Starting at StartInst, scan forward through its basic block for simple
+ /// stores and memsets of the byte value ByteVal at constant offsets from
+ /// StartPtr. Runs of them that pass the profitability check are replaced by a
+ /// memset. Returns the last memset created, or nullptr if none was created.
+ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
+ Value *StartPtr, Value *ByteVal) {
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
+
+ // Okay, so we now have a single store that can be splatable. Scan to find
+ // all subsequent stores of the same value to offset from the same pointer.
+ // Join these together into ranges, so we can decide whether contiguous blocks
+ // are stored.
+ MemsetRanges Ranges(DL);
+
+ BasicBlock::iterator BI(StartInst);
+ for (++BI; !isa<TerminatorInst>(BI); ++BI) { // stop at the block terminator at the latest
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memset(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ // If this is a store, see if we can merge it in.
+ if (!NextStore->isSimple()) break;
+
+ // Check to see if this stored value is of the same byte-splattable value.
+ if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
+ DL))
+ break;
+
+ Ranges.addStore(Offset, NextStore);
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+ !isa<ConstantInt>(MSI->getLength())) // addMemSet casts the length to ConstantInt
+ break;
+
+ // Check to see if this memset is to a constant offset from the start ptr.
+ int64_t Offset;
+ if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
+ break;
+
+ Ranges.addMemSet(Offset, MSI);
+ }
+ }
+
+ // If we have no ranges, then we just had a single store with nothing that
+ // could be merged in. This is a very common case of course.
+ if (Ranges.empty())
+ return nullptr;
+
+ // If we had at least one store that could be merged in, add the starting
+ // store as well. We try to avoid this unless there is at least something
+ // interesting as a small compile-time optimization.
+ Ranges.addInst(0, StartInst);
+
+ // If we create any memsets, we put it right before the first instruction that
+ // isn't part of the memset block. This ensures that the memset is dominated
+ // by any addressing instruction needed by the start of the block.
+ IRBuilder<> Builder(&*BI);
+
+ // Now that we have full information about ranges, loop over the ranges and
+ // emit memset's for anything big enough to be worthwhile.
+ Instruction *AMemSet = nullptr;
+ for (const MemsetRange &Range : Ranges) {
+
+ if (Range.TheStores.size() == 1) continue; // a lone store has nothing to merge with
+
+ // Skip this range unless it is profitable to lower it to memset.
+ if (!Range.isProfitableToUseMemset(DL))
+ continue;
+
+ // Otherwise, we do want to transform this! Create a new memset.
+ // Get the starting pointer of the block.
+ StartPtr = Range.StartPtr;
+
+ // Determine alignment
+ unsigned Alignment = Range.Alignment;
+ if (Alignment == 0) { // none recorded: fall back to the ABI alignment of the pointee type
+ Type *EltType =
+ cast<PointerType>(StartPtr->getType())->getElementType();
+ Alignment = DL.getABITypeAlignment(EltType);
+ }
+
+ AMemSet =
+ Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+
+ DEBUG(dbgs() << "Replace stores:\n";
+ for (Instruction *SI : Range.TheStores)
+ dbgs() << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
+
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+ // Zap all the stores.
+ for (Instruction *SI : Range.TheStores) {
+ MD->removeInstruction(SI); // drop it from memory dependence analysis before erasing
+ SI->eraseFromParent();
+ }
+ ++NumMemSetInfer;
+ }
+
+ return AMemSet;
+ }
+
+ /// Compute an alignment that is valid for both sides of a load/store pair.
+ /// An alignment of zero on either instruction is replaced by the ABI alignment
+ /// of the accessed type (the stored value's type for \p SI, the loaded type
+ /// for \p LI). The smaller of the two resulting alignments is returned.
+ static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+                                     const LoadInst *LI) {
+   unsigned DstAlign = SI->getAlignment();
+   if (DstAlign == 0)
+     DstAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+
+   unsigned SrcAlign = LI->getAlignment();
+   if (SrcAlign == 0)
+     SrcAlign = DL.getABITypeAlignment(LI->getType());
+
+   return SrcAlign < DstAlign ? SrcAlign : DstAlign;
+ }
+
+ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+ if (!SI->isSimple()) return false; // Non-simple stores are never transformed.
+
+ // Avoid merging nontemporal stores since the resulting
+ // memcpy/memset would not be able to preserve the nontemporal hint.
+ // In theory we could teach this pass how to propagate the !nontemporal
+ // metadata to memset calls. However, that change would force the backend to
+ // conservatively expand !nontemporal memset calls back to sequences of
+ // store instructions (effectively undoing the merging).
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return false;
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+
+ // Load to store forwarding can be interpreted as memcpy.
+ if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+ if (LI->isSimple() && LI->hasOneUse() &&
+ LI->getParent() == SI->getParent()) {
+
+ auto *T = LI->getType();
+ if (T->isAggregateType()) {
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ // We use alias analysis to check if an instruction may store to
+ // the memory we load from in between the load and the store. If
+ // such an instruction is found, we try to promote there instead
+ // of at the store position.
+ Instruction *P = SI;
+ for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator();
+ I != E; ++I) {
+ if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod))
+ continue;
+
+ // We found an instruction that may write to the loaded memory.
+ // We can try to promote at this position instead of the store
+ // position if nothing aliases the store memory after this and the
+ // store destination is not defined in the range.
+ P = &*I;
+ for (; I != E; ++I) {
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ if (&*I == SI->getOperand(1) ||
+ AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+ P = nullptr; // No safe insertion point exists.
+ break;
+ }
+ }
+
+ break; // Only the first possibly-writing instruction is examined.
+ }
+
+ // If a valid insertion position is found, then we can promote
+ // the load/store pair to a memcpy.
+ if (P) {
+ // If we load from memory that may alias the memory we store to,
+ // memmove must be used to preserve semantics. If not, memcpy can
+ // be used.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+ UseMemMove = true;
+
+ unsigned Align = findCommonAlignment(DL, SI, LI);
+ uint64_t Size = DL.getTypeStoreSize(T);
+
+ IRBuilder<> Builder(P);
+ Instruction *M;
+ if (UseMemMove)
+ M = Builder.CreateMemMove(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+ else
+ M = Builder.CreateMemCpy(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+ << " => " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ // Detect cases where we're performing call slot forwarding, but
+ // happen to be using a load-store pair to implement it, rather than
+ // a memcpy.
+ MemDepResult ldep = MD->getDependency(LI);
+ CallInst *C = nullptr;
+ if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+ C = dyn_cast<CallInst>(ldep.getInst());
+
+ if (C) {
+ // Check that nothing touches the dest of the "copy" between
+ // the call and the store.
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
+ I != E; --I) {
+ if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+ C = nullptr; // Give up on call slot forwarding.
+ break;
+ }
+ }
+ }
+
+ if (C) {
+ bool changed = performCallSlotOptzn(
+ LI, SI->getPointerOperand()->stripPointerCasts(),
+ LI->getPointerOperand()->stripPointerCasts(),
+ DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+ findCommonAlignment(DL, SI, LI), C);
+ if (changed) {
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+ }
+
+ // There are two cases that are interesting for this code to handle: memcpy
+ // and memset. The code below only handles memset (memcpy is handled above).
+
+ // Ensure that the value being stored is something that can be memset'able a
+ // byte at a time like "0" or "-1" of any width, as well as things like
+ // 0xA0A0A0A0 and 0.0.
+ auto *V = SI->getOperand(0);
+ if (Value *ByteVal = isBytewiseValue(V)) {
+ if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+ ByteVal)) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+
+ // If we have an aggregate, we try to promote it to memset regardless
+ // of opportunity for merging as it can expose optimization opportunities
+ // in subsequent passes.
+ auto *T = V->getType();
+ if (T->isAggregateType()) {
+ uint64_t Size = DL.getTypeStoreSize(T);
+ unsigned Align = SI->getAlignment();
+ if (!Align)
+ Align = DL.getABITypeAlignment(T);
+ IRBuilder<> Builder(SI);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
+ Size, Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ NumMemSetInfer++;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /// Try to widen \p MSI by merging it with neighboring stores or memsets.
+ /// Only a non-volatile memset whose length is a ConstantInt is considered. On
+ /// success \p BBI is pointed at the instruction returned by
+ /// tryMergingIntoMemset and true is returned; otherwise false is returned.
+ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+   if (!isa<ConstantInt>(MSI->getLength()) || MSI->isVolatile())
+     return false;
+
+   Instruction *Merged =
+       tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue());
+   if (!Merged)
+     return false;
+
+   // Hand the merged instruction back so the caller's iterator stays valid.
+   BBI = Merged->getIterator();
+   return true;
+ }
+
+
+ /// Takes a memcpy (or the load of a load/store pair) and a call that it depends
+ /// on, and checks whether the call can write its result directly into the copy
+ /// destination (call slot optimization). Returns true if the call was rewritten.
+ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
+ Value *cpyDest, Value *cpySrc,
+ uint64_t cpyLen, unsigned cpyAlign,
+ CallInst *C) {
+ // The general transformation to keep in mind is
+ //
+ // call @func(..., src, ...)
+ // memcpy(dest, src, ...)
+ //
+ // ->
+ //
+ // memcpy(dest, src, ...)
+ // call @func(..., dest, ...)
+ //
+ // Since moving the memcpy is technically awkward, we additionally check that
+ // src only holds uninitialized values at the moment of the call, meaning that
+ // the memcpy can be discarded rather than moved.
+
+ // Deliberately get the source and destination with bitcasts stripped away,
+ // because we'll need to do type comparisons based on the underlying type.
+ CallSite CS(C);
+
+ // Require that src be an alloca. This simplifies the reasoning considerably.
+ AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ if (!srcAlloca)
+ return false;
+
+ ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+ if (!srcArraySize)
+ return false;
+
+ const DataLayout &DL = cpy->getModule()->getDataLayout();
+ uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
+
+ if (cpyLen < srcSize) // The copy must cover the whole source alloca.
+ return false;
+
+ // Check that accessing the first srcSize bytes of dest will not cause a
+ // trap. Otherwise the transform is invalid since it might cause a trap
+ // to occur earlier than it otherwise would.
+ if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
+ // The destination is an alloca. Check it is at least srcSize bytes.
+ ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+ if (!destArraySize)
+ return false;
+
+ uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
+ destArraySize->getZExtValue();
+
+ if (destSize < srcSize)
+ return false;
+ } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
+ if (A->getDereferenceableBytes() < srcSize) {
+ // If the destination is an sret parameter then only accesses that are
+ // outside of the returned struct type can trap.
+ if (!A->hasStructRetAttr())
+ return false;
+
+ Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+ if (!StructTy->isSized()) {
+ // The call may never return and hence the copy-instruction may never
+ // be executed, and therefore it's not safe to say "the destination
+ // has at least <cpyLen> bytes, as implied by the copy-instruction".
+ return false;
+ }
+
+ uint64_t destSize = DL.getTypeAllocSize(StructTy);
+ if (destSize < srcSize)
+ return false;
+ }
+ } else {
+ return false; // Only alloca and argument destinations are handled.
+ }
+
+ // Check that dest points to memory that is at least as aligned as src.
+ unsigned srcAlign = srcAlloca->getAlignment();
+ if (!srcAlign)
+ srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
+ bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+ // If dest is not aligned enough and we can't increase its alignment then
+ // bail out.
+ if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+ return false;
+
+ // Check that src is not accessed except via the call and the memcpy. This
+ // guarantees that it holds only undefined values when passed in (so the final
+ // memcpy can be dropped), that it is not read or written between the call and
+ // the memcpy, and that writing beyond the end of it is undefined.
+ SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
+ srcAlloca->user_end());
+ while (!srcUseList.empty()) {
+ User *U = srcUseList.pop_back_val();
+
+ if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
+ }
+ if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (!G->hasAllZeroIndices())
+ return false;
+
+ for (User *UU : U->users())
+ srcUseList.push_back(UU);
+ continue;
+ }
+ if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IT->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+
+ if (U != C && U != cpy) // Any other user of src blocks the transform.
+ return false;
+ }
+
+ // Check that src isn't captured by the called function since the
+ // transformation can cause aliasing issues in that case.
+ for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+ if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+ return false;
+
+ // Since we're changing the parameter to the callsite, we need to make sure
+ // that what would be the new parameter dominates the callsite.
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
+ if (!DT.dominates(cpyDestInst, C))
+ return false;
+
+ // In addition to knowing that the call does not access src in some
+ // unexpected manner, for example via a global, which we deduce from
+ // the use analysis, we also need to know that it does not sneakily
+ // access dest. We rely on AA to figure this out for us.
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
+ // If necessary, perform additional analysis.
+ if (MR != MRI_NoModRef)
+ MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
+ if (MR != MRI_NoModRef)
+ return false;
+
+ // All the checks have passed, so do the transformation.
+ bool changedArgument = false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+ Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
+ : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
+ changedArgument = true;
+ if (CS.getArgument(i)->getType() == Dest->getType())
+ CS.setArgument(i, Dest);
+ else
+ CS.setArgument(i, CastInst::CreatePointerCast(Dest,
+ CS.getArgument(i)->getType(), Dest->getName(), C));
+ }
+
+ if (!changedArgument) // src was not passed to the call at all.
+ return false;
+
+ // If the destination wasn't sufficiently aligned then increase its alignment.
+ if (!isDestSufficientlyAligned) {
+ assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ }
+
+ // Drop any cached information about the call, because we may have changed
+ // its dependence information by changing its parameter.
+ MD->removeInstruction(C);
+
+ // Update AA metadata
+ // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+ // handled here, but combineMetadata doesn't support them yet
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(C, cpy, KnownIDs);
+
+ // Drop the copy from the MemDep cache; the callers erase the instruction.
+ MD->removeInstruction(cpy);
+ ++NumMemCpyInstr;
+
+ return true;
+ }
+
+ /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
+ /// the memcpy 'MDep'. Try to make M copy from MDep's input; returns true (and erases M) on success.
+ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
+ // We can only transform memcpy's where the dest of one is the source of the
+ // other.
+ if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+ return false;
+
+ // If dep instruction is reading from our current input, then it is a noop
+ // transfer and substituting the input won't change this instruction. Just
+ // ignore the input and let someone else zap MDep. This handles cases like:
+ // memcpy(a <- a)
+ // memcpy(b <- a)
+ if (M->getSource() == MDep->getSource())
+ return false;
+
+ // Second, the length of the memcpy's must be the same, or the preceding one
+ // must be larger than the following one.
+ ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+ return false; // Non-constant length, or MDep wrote fewer bytes than M reads.
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ // Verify that the copied-from memory doesn't change in between the two
+ // transfers. For example, in:
+ // memcpy(a <- b)
+ // *b = 42;
+ // memcpy(c <- a)
+ // It would be invalid to transform the second memcpy into memcpy(c <- b).
+ //
+ // TODO: If the code between M and MDep is transparent to the destination "c",
+ // then we could still perform the xform by moving M up to the first memcpy.
+ //
+ // NOTE: This is conservative, it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep =
+ MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
+ M->getIterator(), M->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+
+ // If the dest of the second might alias the source of the first, then the
+ // source and dest might overlap. We still want to eliminate the intermediate
+ // value, but we have to generate a memmove instead of memcpy.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+ MemoryLocation::getForSource(MDep)))
+ UseMemMove = true;
+
+ // If all checks passed, then we can transform M.
+
+ // Make sure to use the lesser of the alignment of the source and the dest
+ // since we're changing where we're reading from, but don't want to increase
+ // the alignment past what can be read from or written to.
+ // TODO: Is this worth it if we're creating a less aligned memcpy? For
+ // example we could be moving from movaps -> movq on x86.
+ unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+
+ IRBuilder<> Builder(M);
+ if (UseMemMove)
+ Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+ Align, M->isVolatile());
+ else
+ Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+ Align, M->isVolatile());
+
+ // Remove the instruction we're replacing.
+ MD->removeInstruction(M);
+ M->eraseFromParent();
+ ++NumMemCpyInstr;
+ return true;
+ }
+
+ /// We've found that the (upward scanning) memory dependence of \p MemCpy is
+ /// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
+ /// weren't copied over by \p MemCpy. Returns true if \p MemSet was replaced.
+ ///
+ /// In other words, transform:
+ /// \code
+ /// memset(dst, c, dst_size);
+ /// memcpy(dst, src, src_size);
+ /// \endcode
+ /// into:
+ /// \code
+ /// memcpy(dst, src, src_size);
+ /// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+ /// \endcode
+ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ // We can only transform memset/memcpy with the same destination.
+ if (MemSet->getDest() != MemCpy->getDest())
+ return false;
+
+ // Check that there are no other dependencies on the memset destination.
+ MemDepResult DstDepInfo =
+ MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
+ MemCpy->getIterator(), MemCpy->getParent());
+ if (DstDepInfo.getInst() != MemSet)
+ return false;
+
+ // Use the same i8* dest as the memcpy, killing the memset dest if different.
+ Value *Dest = MemCpy->getRawDest();
+ Value *DestSize = MemSet->getLength();
+ Value *SrcSize = MemCpy->getLength();
+
+ // By default, create an unaligned memset.
+ unsigned Align = 1;
+ // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+ // of the sum.
+ const unsigned DestAlign =
+ std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+ if (DestAlign > 1)
+ if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+ IRBuilder<> Builder(MemCpy); // Note: new instructions go *before* MemCpy.
+
+ // If the sizes have different types, zext the smaller one.
+ if (DestSize->getType() != SrcSize->getType()) {
+ if (DestSize->getType()->getIntegerBitWidth() >
+ SrcSize->getType()->getIntegerBitWidth())
+ SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
+ else
+ DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
+ }
+
+ Value *MemsetLen =
+ Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize),
+ ConstantInt::getNullValue(DestSize->getType()),
+ Builder.CreateSub(DestSize, SrcSize));
+ Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
+ MemsetLen, Align);
+
+ MD->removeInstruction(MemSet); // The original, full-size memset is replaced.
+ MemSet->eraseFromParent();
+ return true;
+ }
+
+ /// Replace a memcpy whose source was just filled by a memset with an
+ /// equivalent memset of the copy destination. That is, rewrite
+ /// \code
+ /// memset(dst1, c, dst1_size);
+ /// memcpy(dst2, dst1, dst2_size);
+ /// \endcode
+ /// as
+ /// \code
+ /// memset(dst1, c, dst1_size);
+ /// memset(dst2, c, dst2_size);
+ /// \endcode
+ /// provided dst2_size <= dst1_size.
+ ///
+ /// \p MemCpy must have a constant length. The new memset is emitted in front
+ /// of \p MemCpy; \p MemCpy itself is left in place for the caller to remove.
+ /// \returns true if the new memset was created.
+ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+                                            MemSetInst *MemSet) {
+   // The memcpy has to read exactly the pointer the memset wrote through.
+   if (MemCpy->getRawSource() != MemSet->getRawDest())
+     return false;
+
+   auto *CopyLen = cast<ConstantInt>(MemCpy->getLength());
+   auto *FillLen = dyn_cast<ConstantInt>(MemSet->getLength());
+   if (!FillLen)
+     return false;
+
+   // The copy must not read past what the memset initialized. Lengths wider
+   // than 64 bits are not a concern here.
+   if (FillLen->getZExtValue() < CopyLen->getZExtValue())
+     return false;
+
+   IRBuilder<> IRB(MemCpy);
+   IRB.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopyLen,
+                    MemCpy->getAlignment());
+   return true;
+ }
+
+ /// Perform simplification of memcpy's. If we have memcpy A
+ /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+ /// B to be a memcpy from X to Z (or potentially a memmove, depending on
+ /// circumstances). This allows later passes to remove the first memcpy
+ /// altogether.
+ ///
+ /// Besides that memcpy-memcpy forwarding, this also removes self-copies, turns
+ /// copies from bytewise-constant globals into memsets, shrinks a preceding
+ /// memset, performs call slot forwarding, and drops copies from memory whose
+ /// contents are undefined.
+ ///
+ /// \returns true if the function was modified in any way. \p M may have been
+ /// erased when true is returned.
+ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+   // We can only optimize non-volatile memcpy's.
+   if (M->isVolatile()) return false;
+
+   // If the source and destination of the memcpy are the same, then zap it.
+   // Erasing the instruction modifies the function, so it must be reported as
+   // a change, exactly like every other path below that erases M.
+   if (M->getSource() == M->getDest()) {
+     MD->removeInstruction(M);
+     M->eraseFromParent();
+     return true;
+   }
+
+   // If copying from a constant, try to turn the memcpy into a memset.
+   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+     if (GV->isConstant() && GV->hasDefinitiveInitializer())
+       if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+         IRBuilder<> Builder(M);
+         Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+                              M->getAlignment(), false);
+         MD->removeInstruction(M);
+         M->eraseFromParent();
+         ++NumCpyToSet;
+         return true;
+       }
+
+   MemDepResult DepInfo = MD->getDependency(M);
+
+   // Try to turn a partially redundant memset + memcpy into
+   // memcpy + smaller memset. We don't need the memcpy size for this.
+   if (DepInfo.isClobber())
+     if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+       if (processMemSetMemCpyDependence(M, MDep))
+         return true;
+
+   // The optimizations after this point require the memcpy size.
+   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+   if (!CopySize) return false;
+
+   // There are four possible optimizations we can do for memcpy:
+   // a) memcpy-memcpy xform which exposes redundance for DSE.
+   // b) call-memcpy xform for return slot optimization.
+   // c) memcpy from freshly alloca'd space or space that has just started its
+   // lifetime copies undefined data, and we can therefore eliminate the
+   // memcpy in favor of the data that was already at the destination.
+   // d) memcpy from a just-memset'd source can be turned into memset.
+   if (DepInfo.isClobber()) {
+     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+       if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+                                CopySize->getZExtValue(), M->getAlignment(),
+                                C)) {
+         // The call now writes straight into the destination; drop the copy.
+         MD->removeInstruction(M);
+         M->eraseFromParent();
+         return true;
+       }
+     }
+   }
+
+   MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+   MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
+       SrcLoc, true, M->getIterator(), M->getParent());
+
+   if (SrcDepInfo.isClobber()) {
+     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+       return processMemCpyMemCpyDependence(M, MDep);
+   } else if (SrcDepInfo.isDef()) {
+     Instruction *I = SrcDepInfo.getInst();
+     bool hasUndefContents = false;
+
+     if (isa<AllocaInst>(I)) {
+       hasUndefContents = true;
+     } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+       // A lifetime.start only helps if it covers at least the copied bytes.
+       if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+         if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+           if (LTSize->getZExtValue() >= CopySize->getZExtValue())
+             hasUndefContents = true;
+     }
+
+     if (hasUndefContents) {
+       MD->removeInstruction(M);
+       M->eraseFromParent();
+       ++NumMemCpyInstr;
+       return true;
+     }
+   }
+
+   if (SrcDepInfo.isClobber())
+     if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+       if (performMemCpyToMemSetOptzn(M, MDep)) {
+         // A memset of the destination was emitted in front of M; remove M.
+         MD->removeInstruction(M);
+         M->eraseFromParent();
+         ++NumCpyToSet;
+         return true;
+       }
+
+   return false;
+ }
+
+ /// Turn a memmove into a memcpy when alias analysis proves that its source and
+ /// destination do not alias. The call is retargeted in place to the memcpy
+ /// intrinsic overloaded on the same operand types.
+ ///
+ /// \returns true if \p M was converted.
+ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
+   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+   // Bail out if memmove is unavailable, or if the two regions may overlap.
+   if (!TLI->has(LibFunc::memmove) ||
+       !AA.isNoAlias(MemoryLocation::getForDest(M),
+                     MemoryLocation::getForSource(M)))
+     return false;
+
+   DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
+
+   // Build the memcpy declaration with the operand types of the memmove.
+   Type *OverloadTys[3] = { M->getRawDest()->getType(),
+                            M->getRawSource()->getType(),
+                            M->getLength()->getType() };
+   Function *MemCpyFn =
+       Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, OverloadTys);
+   M->setCalledFunction(MemCpyFn);
+
+   // Whatever MemDep cached for this instruction may now be overly
+   // conservative, so flush it.
+   MD->removeInstruction(M);
+
+   ++NumMoveToCpy;
+   return true;
+ }
+
+ /// Called on every byval argument of a call site; tries to pass the source of a feeding memcpy instead.
+ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
+ const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
+ // Find out what feeds this byval argument.
+ Value *ByValArg = CS.getArgument(ArgNo);
+ Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+ uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+ MemDepResult DepInfo = MD->getPointerDependencyFrom(
+ MemoryLocation(ByValArg, ByValSize), true,
+ CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
+ if (!DepInfo.isClobber())
+ return false;
+
+ // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
+ // a memcpy, see if we can byval from the source of the memcpy instead of the
+ // result.
+ MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+ if (!MDep || MDep->isVolatile() ||
+ ByValArg->stripPointerCasts() != MDep->getDest())
+ return false;
+
+ // The length of the memcpy must be larger or equal to the size of the byval.
+ ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+ return false;
+
+ // Get the alignment of the byval. If the call doesn't specify the alignment,
+ // then it is some target specific value that we can't know.
+ unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); // NOTE(review): +1 presumably because attribute indices are 1-based; confirm.
+ if (ByValAlign == 0) return false;
+
+ // If it is greater than the memcpy's alignment, then we check to see if we
+ // can force the source of the memcpy to the alignment we need. If not, bail.
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *CS->getParent()->getParent());
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ if (MDep->getAlignment() < ByValAlign &&
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
+ CS.getInstruction(), &AC, &DT) < ByValAlign)
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the memcpy and
+ // the byval call.
+ // memcpy(a <- b)
+ // *b = 42;
+ // foo(*a)
+ // It would be invalid to transform the call into foo(*b).
+ //
+ // NOTE: This is conservative, it will stop on any read from the source loc,
+ // not just the defining memcpy.
+ MemDepResult SourceDep = MD->getPointerDependencyFrom(
+ MemoryLocation::getForSource(MDep), false,
+ CS.getInstruction()->getIterator(), MDep->getParent());
+ if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+ return false;
+
+ // Insert a bitcast before the call if the memcpy source has a different type.
+ Value *TmpCast = MDep->getSource();
+ if (MDep->getSource()->getType() != ByValArg->getType())
+ TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+ "tmpcast", CS.getInstruction());
+
+ DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << *CS.getInstruction() << "\n");
+
+ // Otherwise we're good! Update the byval argument.
+ CS.setArgument(ArgNo, TmpCast);
+ ++NumMemCpyInstr;
+ return true;
+ }
+
+ /// Run a single sweep of MemCpyOpt over every instruction of \p F.
+ /// Stores, memsets, memcpys, memmoves and the byval arguments of call sites
+ /// are each dispatched to their dedicated handler.
+ ///
+ /// \returns true if any handler reported a change during the sweep.
+ bool MemCpyOpt::iterateOnFunction(Function &F) {
+   bool Changed = false;
+
+   for (BasicBlock &Block : F) {
+     BasicBlock::iterator Cursor = Block.begin();
+     BasicBlock::iterator BlockEnd = Block.end();
+     while (Cursor != BlockEnd) {
+       // Step past the current instruction up front so that erasing it does
+       // not invalidate the cursor.
+       Instruction *Cur = &*Cursor++;
+
+       bool Revisit = false;
+
+       if (auto *Store = dyn_cast<StoreInst>(Cur)) {
+         Changed |= processStore(Store, Cursor);
+       } else if (auto *Set = dyn_cast<MemSetInst>(Cur)) {
+         Revisit = processMemSet(Set, Cursor);
+       } else if (auto *Copy = dyn_cast<MemCpyInst>(Cur)) {
+         Revisit = processMemCpy(Copy);
+       } else if (auto *Move = dyn_cast<MemMoveInst>(Cur)) {
+         Revisit = processMemMove(Move);
+       } else if (auto CS = CallSite(Cur)) {
+         for (unsigned ArgNo = 0, NumArgs = CS.arg_size(); ArgNo != NumArgs;
+              ++ArgNo)
+           if (CS.isByValArgument(ArgNo))
+             Changed |= processByValArgument(CS, ArgNo);
+       }
+
+       if (!Revisit)
+         continue;
+
+       // A handler asked for another look: back the cursor up by one (unless
+       // it already sits at the start of the block) and record the change.
+       if (Cursor != Block.begin())
+         --Cursor;
+       Changed = true;
+     }
+   }
+
+   return Changed;
+ }
+
+ /// Pass entry point: repeatedly sweep \p F with iterateOnFunction until a
+ /// sweep reports no change.
+ ///
+ /// \returns true if any sweep changed the function.
+ bool MemCpyOpt::runOnFunction(Function &F) {
+   if (skipOptnoneFunction(F))
+     return false;
+
+   MD = &getAnalysis<MemoryDependenceAnalysis>();
+   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+   // memset and memcpy are required even of a freestanding implementation; if
+   // either one is disabled there is little point in trying anything here.
+   // Note that MD is not reset on this early exit.
+   const bool HaveLibFuncs =
+       TLI->has(LibFunc::memset) && TLI->has(LibFunc::memcpy);
+   if (!HaveLibFuncs)
+     return false;
+
+   // Iterate to a fixed point.
+   bool Changed = false;
+   while (iterateOnFunction(F))
+     Changed = true;
+
+   MD = nullptr;
+   return Changed;
+ }
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
new file mode 100644
index 0000000..c812d61
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -0,0 +1,587 @@
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! \brief This pass performs merges of loads and stores on both sides of a
+// diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. It hoists / sinks
+// when it thinks it is safe to do so. This optimization helps with e.g. hiding
+// load latencies, triggering if-conversion, and reducing static code size.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+// header:
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// %lt = load %addr_l %le = load %addr_l
+// <use %lt> <use %le>
+// <...> <...>
+// store %st, %addr_s store %se, %addr_s
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// <...>
+//
+// Diamond shaped code after merge:
+//
+// header:
+// %l = load %addr_l
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// <use %l> <use %l>
+// <...> <...>
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// %s.sink = phi [%st, if.then], [%se, if.else]
+// <...>
+// store %s.sink, %addr_s
+// <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+//===----------------------------------------------------------------------===//
+// MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Function pass that merges matching loads and stores found on the two sides
+/// of a diamond: loads are hoisted into the head, stores are sunk into the
+/// tail.
+class MergedLoadStoreMotion : public FunctionPass {
+  AliasAnalysis *AA;
+  MemoryDependenceAnalysis *MD;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  // Both analysis pointers start out null; runOnFunction() assigns them
+  // before any helper uses them.
+  MergedLoadStoreMotion()
+      : FunctionPass(ID), AA(nullptr), MD(nullptr),
+        MagicCompileTimeControl(250) {
+    initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // Requires TargetLibraryInfo and alias analysis results. The CFG, GlobalsAA
+  // and MemoryDependenceAnalysis are declared as preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<MemoryDependenceAnalysis>();
+  }
+
+  // Helper routines
+
+  ///
+  /// \brief Remove instruction from parent and update memory dependence
+  /// analysis.
+  ///
+  void removeInstruction(Instruction *Inst);
+  BasicBlock *getDiamondTail(BasicBlock *BB);
+  bool isDiamondHead(BasicBlock *BB);
+  // Routines for hoisting loads
+  bool isLoadHoistBarrierInRange(const Instruction& Start,
+                                 const Instruction& End,
+                                 LoadInst* LI);
+  LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
+  void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
+                        Instruction *ElseInst);
+  bool isSafeToHoist(Instruction *I) const;
+  bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
+  bool mergeLoads(BasicBlock *BB);
+  // Routines for sinking stores
+  StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+  PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+  bool isStoreSinkBarrierInRange(const Instruction &Start,
+                                 const Instruction &End, MemoryLocation Loc);
+  bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+  bool mergeStores(BasicBlock *BB);
+  // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+  // where Size0 and Size1 are the #instructions on the two sides of
+  // the diamond. The constant chosen here is arbitrary. Compiler Time
+  // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+  const int MagicCompileTimeControl;
+};
+
+char MergedLoadStoreMotion::ID = 0;
+} // anonymous namespace
+
+/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+///
+/// \return A MergedLoadStoreMotion pass instance allocated with new.
+FunctionPass *llvm::createMergedLoadStoreMotionPass() {
+ return new MergedLoadStoreMotion();
+}
+
+// Pass registration: the pass is registered under the "mldst-motion" argument
+// together with the analysis passes listed as its dependencies.
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion,
+                      "mldst-motion", "MergedLoadStoreMotion",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotion,
+                    "mldst-motion", "MergedLoadStoreMotion",
+                    false, false)
+
+///
+/// \brief Erase \p Inst from its parent block, first updating memory
+/// dependence analysis when that analysis is available.
+///
+void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
+  // Without memory dependence analysis there is nothing to keep in sync.
+  if (!MD) {
+    Inst->eraseFromParent();
+    return;
+  }
+
+  // Tell the analysis that the instruction is going away.
+  MD->removeInstruction(Inst);
+
+  // For a load, also invalidate what is cached for the pointer it reads.
+  if (auto *Load = dyn_cast<LoadInst>(Inst))
+    MD->invalidateCachedPointerInfo(Load->getPointerOperand());
+
+  // If the instruction's (scalar) type is a pointer, invalidate the info
+  // cached for the instruction itself too.
+  const bool YieldsPointer = Inst->getType()->getScalarType()->isPointerTy();
+  if (YieldsPointer)
+    MD->invalidateCachedPointerInfo(Inst);
+
+  Inst->eraseFromParent();
+}
+
+///
+/// \brief Return tail block of a diamond.
+///
+/// \p BB must be a diamond head; the tail is the successor of the head's
+/// first successor.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+  assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+  auto *HeadBranch = cast<BranchInst>(BB->getTerminator());
+  BasicBlock *FirstArm = HeadBranch->getSuccessor(0);
+  return FirstArm->getTerminator()->getSuccessor(0);
+}
+
+///
+/// \brief True when BB is the head of a diamond (hammock)
+///
+/// That is: \p BB ends in a two-way branch, each successor has \p BB as its
+/// single predecessor and exactly one successor of its own, and both
+/// successors lead to the same block.
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+  if (!BB)
+    return false;
+
+  auto *HeadBranch = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!HeadBranch || HeadBranch->getNumSuccessors() != 2)
+    return false;
+
+  BasicBlock *Arms[2] = {HeadBranch->getSuccessor(0),
+                         HeadBranch->getSuccessor(1)};
+
+  // Each arm needs a single predecessor and a single way out.
+  for (BasicBlock *Arm : Arms)
+    if (!Arm->getSinglePredecessor() ||
+        Arm->getTerminator()->getNumSuccessors() != 1)
+      return false;
+
+  // Ignore triangles: both arms have to join in the same tail block.
+  BasicBlock *Tail = Arms[0]->getTerminator()->getSuccessor(0);
+  return Arms[1]->getTerminator()->getSuccessor(0) == Tail;
+}
+
+///
+/// \brief True when the instruction range contains a hoist barrier for a load
+///
+/// Whenever an instruction could possibly modify the value
+/// being loaded or protect against the load from happening
+/// it is considered a hoist barrier. The range from \p Start to \p End is
+/// checked for possible modification (MRI_Mod) of the location read by \p LI.
+///
+bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(
+    const Instruction &Start, const Instruction &End, LoadInst *LI) {
+  return AA->canInstructionRangeModRef(Start, End, MemoryLocation::get(LI),
+                                       MRI_Mod);
+}
+
+///
+/// \brief Decide if a load can be hoisted
+///
+/// When there is a load in \p BB1 to the same address as \p Load0
+/// and it can be hoisted from \p BB1, return that load.
+/// Otherwise return Null.
+///
+LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
+                                                   LoadInst *Load0) {
+  BasicBlock *BB0 = Load0->getParent();
+  const MemoryLocation Loc0 = MemoryLocation::get(Load0);
+
+  for (Instruction &Inst : *BB1) {
+    // Only merge and hoist loads whose result is used only in BB1.
+    auto *Load1 = dyn_cast<LoadInst>(&Inst);
+    if (!Load1 || Load1->isUsedOutsideOfBlock(BB1))
+      continue;
+
+    // Both loads must read the same location with the same operation.
+    const MemoryLocation Loc1 = MemoryLocation::get(Load1);
+    if (!AA->isMustAlias(Loc0, Loc1) || !Load0->isSameOperationAs(Load1))
+      continue;
+
+    // Nothing between the start of either block and its load may be a
+    // barrier for that load.
+    if (isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1))
+      continue;
+    if (isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0))
+      continue;
+
+    return Load1;
+  }
+  return nullptr;
+}
+
+///
+/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
+/// \p BB
+///
+/// BB is the head of a diamond. A clone of \p HoistCand is inserted before the
+/// terminator of \p BB; both originals are replaced by the clone and erased.
+void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
+ Instruction *HoistCand,
+ Instruction *ElseInst) {
+ DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
+ // The candidate must not already live in the destination block.
+ assert(HoistCand->getParent() != BB);
+
+ // Intersect optional metadata (done on HoistCand before it is cloned).
+ HoistCand->intersectOptionalDataWith(ElseInst);
+ HoistCand->dropUnknownNonDebugMetadata();
+
+ // Insertion point: right before the terminator of BB.
+ Instruction *HoistPt = BB->getTerminator();
+
+ // The merged instruction is a clone of HoistCand.
+ Instruction *HoistedInst = HoistCand->clone();
+
+ // Hoist instruction.
+ HoistedInst->insertBefore(HoistPt);
+
+ HoistCand->replaceAllUsesWith(HoistedInst);
+ removeInstruction(HoistCand);
+ // Replace the else block instruction.
+ ElseInst->replaceAllUsesWith(HoistedInst);
+ removeInstruction(ElseInst);
+}
+
+///
+/// \brief Return true if no operand of \p I is defined in I's parent block
+///
+bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
+  const BasicBlock *Home = I->getParent();
+  for (const Use &Op : I->operands()) {
+    // Only operands that are instructions can be defined in a block.
+    const auto *Def = dyn_cast<Instruction>(Op.get());
+    if (Def && Def->getParent() == Home)
+      return false;
+  }
+  return true;
+}
+
+///
+/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
+///
+/// Only fires when both addresses are identical single-use GEP instructions
+/// living in the same block as their load, and the GEP of \p L0 has no operand
+/// defined in its own block. \return true if the loads were hoisted.
+///
+bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
+                                      LoadInst *L1) {
+  auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
+  auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
+
+  // Both addresses must be computed by instructions ...
+  if (!A0 || !A1)
+    return false;
+  // ... that are identical and whose operands are available in the head ...
+  if (!A0->isIdenticalTo(A1) || !isSafeToHoist(A0))
+    return false;
+  // ... each feeding only its own load, from the load's own block ...
+  if (!A0->hasOneUse() || A0->getParent() != L0->getParent())
+    return false;
+  if (!A1->hasOneUse() || A1->getParent() != L1->getParent())
+    return false;
+  // ... and they must be GEPs.
+  if (!isa<GetElementPtrInst>(A0))
+    return false;
+
+  DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
+        dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
+        dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
+  // Addresses first, then the loads that use them.
+  hoistInstruction(BB, A0, A1);
+  hoistInstruction(BB, L0, L1);
+  return true;
+}
+
+///
+/// \brief Try to hoist two loads to same address into diamond header
+///
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a load in the second successor.
+///
+/// \return true if at least one pair of loads was merged and hoisted.
+///
+bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
+  bool MergedLoads = false;
+  assert(isDiamondHead(BB));
+  // isDiamondHead() only accepts blocks terminated by a BranchInst, so use the
+  // checked cast<> instead of dereferencing an unchecked dyn_cast<> result.
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+  // #Instructions in Succ1 for Compile Time Control
+  int Size1 = Succ1->size();
+  int NLoads = 0;
+  for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
+       BBI != BBE;) {
+    // Step past I before it can be erased by a successful hoist.
+    Instruction *I = &*BBI;
+    ++BBI;
+
+    // Only move simple (non-atomic, non-volatile) loads whose result is not
+    // used outside of Succ0.
+    LoadInst *L0 = dyn_cast<LoadInst>(I);
+    if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
+      continue;
+
+    // Compile time control: give up once the work bound is reached.
+    ++NLoads;
+    if (NLoads * Size1 >= MagicCompileTimeControl)
+      break;
+    if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
+      bool Res = hoistLoad(BB, L0, L1);
+      MergedLoads |= Res;
+      // Don't attempt to hoist above loads that had not been hoisted.
+      if (!Res)
+        break;
+    }
+  }
+  return MergedLoads;
+}
+
+///
+/// \brief True when an instruction in the range from \p Start to \p End is a
+/// sink barrier for a store located in \p Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier. The range is checked with
+/// AA->canInstructionRangeModRef using MRI_ModRef.
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End,
+ MemoryLocation Loc) {
+ return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef);
+}
+
+///
+/// \brief Check if \p BB1 contains a store to the same address as \p Store0
+///
+/// The block is scanned from its last instruction backwards.
+///
+/// \return The store in \p BB1 when it is safe to sink. Otherwise return Null.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+                                                   StoreInst *Store0) {
+  DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+  BasicBlock *BB0 = Store0->getParent();
+  const MemoryLocation Loc0 = MemoryLocation::get(Store0);
+
+  for (auto RIt = BB1->rbegin(), REnd = BB1->rend(); RIt != REnd; ++RIt) {
+    auto *Store1 = dyn_cast<StoreInst>(&*RIt);
+    if (!Store1)
+      continue;
+
+    // Both stores must write the same location with the same operation.
+    const MemoryLocation Loc1 = MemoryLocation::get(Store1);
+    if (!AA->isMustAlias(Loc0, Loc1) || !Store0->isSameOperationAs(Store1))
+      continue;
+
+    // Nothing between either store and the end of its block may be a barrier
+    // for that store.
+    const Instruction &After1 = *std::next(BasicBlock::iterator(Store1));
+    if (isStoreSinkBarrierInRange(After1, BB1->back(), Loc1))
+      continue;
+    const Instruction &After0 = *std::next(BasicBlock::iterator(Store0));
+    if (isStoreSinkBarrierInRange(After0, BB0->back(), Loc0))
+      continue;
+
+    return Store1;
+  }
+  return nullptr;
+}
+
+///
+/// \brief Create a PHI node in BB for the operands of S0 and S1
+///
+/// \return The new PHI, placed at the front of \p BB, or Null when both stores
+/// already store the same value and no PHI is needed.
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+                                              StoreInst *S1) {
+  Value *Val0 = S0->getValueOperand();
+  Value *Val1 = S1->getValueOperand();
+
+  // Identical values: nothing to merge.
+  if (Val0 == Val1)
+    return nullptr;
+
+  // One incoming value per store, keyed by the store's parent block.
+  PHINode *Phi = PHINode::Create(Val0->getType(), 2, Val1->getName() + ".sink",
+                                 &BB->front());
+  Phi->addIncoming(Val0, S0->getParent());
+  Phi->addIncoming(Val1, S1->getParent());
+
+  // Let memory dependence analysis know about a new pointer-typed value.
+  if (MD && Phi->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(Phi);
+  return Phi;
+}
+
+///
+/// \brief Merge two stores to same address and sink into \p BB
+///
+/// Also sinks the GEP instruction computing the store address. Returns true
+/// when the stores were merged, false when the address pattern did not match.
+bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Both addresses must be identical single-use GEPs in their store's block.
+ Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
+ DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ // Sink to the first insertion point of BB.
+ BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+ // Intersect optional metadata.
+ S0->intersectOptionalDataWith(S1);
+ S0->dropUnknownNonDebugMetadata();
+
+ // Create the new store to be inserted at the join point.
+ StoreInst *SNew = (StoreInst *)(S0->clone());
+ Instruction *ANew = A0->clone();
+ SNew->insertBefore(&*InsertPt);
+ ANew->insertBefore(SNew);
+
+ assert(S0->getParent() == A0->getParent());
+ assert(S1->getParent() == A1->getParent());
+
+ PHINode *NewPN = getPHIOperand(BB, S0, S1);
+ // New PHI operand? Use it as the stored value (operand 0) of the new store.
+ if (NewPN)
+ SNew->setOperand(0, NewPN);
+ removeInstruction(S0);
+ removeInstruction(S1);
+ A0->replaceAllUsesWith(ANew);
+ removeInstruction(A0);
+ A1->replaceAllUsesWith(ANew);
+ removeInstruction(A1);
+ return true;
+ }
+ return false;
+}
+
+///
+/// \brief True when two stores are equivalent and can sink into the footer
+///
+/// Starting from a diamond tail block, iterate over the instructions in one
+/// predecessor block and try to match a store in the second predecessor.
+/// Bails out when the first two predecessors coincide or there are more.
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+
+ bool MergedStores = false;
+ assert(T && "Footer of a diamond cannot be empty");
+
+ pred_iterator PI = pred_begin(T), E = pred_end(T);
+ assert(PI != E);
+ BasicBlock *Pred0 = *PI;
+ ++PI;
+ BasicBlock *Pred1 = *PI;
+ ++PI;
+ // tail block of a diamond/hammock?
+ if (Pred0 == Pred1)
+ return false; // No.
+ if (PI != E)
+ return false; // No. More than 2 predecessors.
+
+ // #Instructions in Pred1 for Compile Time Control
+ int Size1 = Pred1->size();
+ int NStores = 0;
+
+ for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+ RBI != RBE;) {
+
+ Instruction *I = &*RBI;
+ ++RBI;
+
+ // Only sink simple (non-atomic, non-volatile) stores
+ if (!isa<StoreInst>(I))
+ continue;
+ StoreInst *S0 = (StoreInst *)I;
+ if (!S0->isSimple())
+ continue;
+
+ ++NStores;
+ if (NStores * Size1 >= MagicCompileTimeControl)
+ break;
+ if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+ bool Res = sinkStore(T, S0, S1);
+ MergedStores |= Res;
+ // Don't attempt to sink below stores that had to stick around
+ // But after removal of a store and some of its feeding
+ // instruction search again from the beginning since the iterator
+ // is likely stale at this point.
+ if (!Res)
+ break;
+ else {
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ }
+ }
+ }
+ return MergedStores;
+}
+
+///
+/// \brief Run the transformation for each function
+///
+/// Every basic block of \p F that is the head of a diamond gets its loads
+/// merged and the stores of its tail merged. \return true on any change.
+///
+bool MergedLoadStoreMotion::runOnFunction(Function &F) {
+  MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  DEBUG(dbgs() << "Instruction Merger\n");
+
+  bool Changed = false;
+  Function::iterator FI = F.begin(), FE = F.end();
+  while (FI != FE) {
+    // Step the iterator before working on the block.
+    BasicBlock *BB = &*FI;
+    ++FI;
+
+    if (!isDiamondHead(BB))
+      continue;
+
+    // Hoist equivalent loads and sink stores
+    // outside diamonds when possible
+    if (mergeLoads(BB))
+      Changed = true;
+    if (mergeStores(getDiamondTail(BB)))
+      Changed = true;
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
new file mode 100644
index 0000000..c8f885e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -0,0 +1,577 @@
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+// void foo(int a, int b) {
+// bar(a + b);
+// bar((a + 2) + b);
+// }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+// int t = a + b;
+// bar(t);
+// bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
+// (a + b) is computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of (a +
+// b) + c, it checks whether a + c or b + c is already computed by a dominating
+// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
+// c) + a and removes the redundancy accordingly. To efficiently look up whether
+// an expression is computed before, we store each instruction seen and its SCEV
+// into an SCEV-to-instruction map.
+//
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the function
+// in the depth-first order. For example, given
+//
+// (a + c) + d
+// ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to be run multiple
+// iterations before emitting optimal code. One source of this need is that we
+// only split an operand when it is used only once. The above algorithm can
+// eliminate an instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+// ac = a + c
+// ab = a + b
+// abc = ab + c
+// ab2 = ab + b
+// ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
+// result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only consider n-ary adds and muls for now. This should be extended
+// and generalized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+/// Function pass that reassociates adds, muls and GEPs so that they reuse
+/// expressions already computed by dominating instructions.
+class NaryReassociate : public FunctionPass {
+public:
+  static char ID;
+
+  // All cached analysis pointers start out null. DL is assigned in
+  // doInitialization(), the others at the top of runOnFunction().
+  NaryReassociate()
+      : FunctionPass(ID), AC(nullptr), DL(nullptr), DT(nullptr), SE(nullptr),
+        TLI(nullptr), TTI(nullptr) {
+    initializeNaryReassociatePass(*PassRegistry::getPassRegistry());
+  }
+
+  // Caches the module's data layout; never reports a change.
+  bool doInitialization(Module &M) override {
+    DL = &M.getDataLayout();
+    return false;
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  // Runs only one iteration of the dominator-based algorithm. See the header
+  // comments for why we need multiple iterations.
+  bool doOneIteration(Function &F);
+
+  // Reassociates I for better CSE.
+  Instruction *tryReassociate(Instruction *I);
+
+  // Reassociate GEP for better CSE.
+  Instruction *tryReassociateGEP(GetElementPtrInst *GEP);
+  // Try splitting GEP at the I-th index and see whether either part can be
+  // CSE'ed. This is a helper function for tryReassociateGEP.
+  //
+  // \p IndexedType The element type indexed by GEP's I-th index. This is
+  //                equivalent to
+  //                  GEP->getIndexedType(GEP->getPointerOperand(), 0-th index,
+  //                                      ..., i-th index).
+  GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+                                              unsigned I, Type *IndexedType);
+  // Given GEP's I-th index = LHS + RHS, see whether &Base[..][LHS][..] or
+  // &Base[..][RHS][..] can be CSE'ed and rewrite GEP accordingly.
+  GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+                                              unsigned I, Value *LHS,
+                                              Value *RHS, Type *IndexedType);
+
+  // Reassociate binary operators for better CSE.
+  Instruction *tryReassociateBinaryOp(BinaryOperator *I);
+
+  // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly
+  // passed.
+  Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS,
+                                      BinaryOperator *I);
+  // Rewrites I to (LHS op RHS) if LHS is computed already.
+  Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS,
+                                       BinaryOperator *I);
+
+  // Tries to match Op1 and Op2 by using V.
+  bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2);
+
+  // Gets SCEV for (LHS op RHS).
+  const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
+                            const SCEV *RHS);
+
+  // Returns the closest dominator of \c Dominatee that computes
+  // \c CandidateExpr. Returns null if not found.
+  Instruction *findClosestMatchingDominator(const SCEV *CandidateExpr,
+                                            Instruction *Dominatee);
+  // GetElementPtrInst implicitly sign-extends an index if the index is shorter
+  // than the pointer size. This function returns whether Index is shorter than
+  // GEP's pointer size, i.e., whether Index needs to be sign-extended in order
+  // to be an index of GEP.
+  bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP);
+
+  AssumptionCache *AC;
+  const DataLayout *DL;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  TargetLibraryInfo *TLI;
+  TargetTransformInfo *TTI;
+  // A lookup table quickly telling which instructions compute the given SCEV.
+  // Note that there can be multiple instructions at different locations
+  // computing to the same SCEV, so we map a SCEV to an instruction list. For
+  // example,
+  //
+  //   if (p1)
+  //     foo(a + b);
+  //   if (p2)
+  //     bar(a + b);
+  DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs;
+};
+} // anonymous namespace
+
+// Pass identification.
+char NaryReassociate::ID = 0;
+
+// Pass registration: the pass is registered under the "nary-reassociate"
+// argument together with the analysis passes listed as its dependencies.
+INITIALIZE_PASS_BEGIN(NaryReassociate,
+                      "nary-reassociate", "Nary reassociation",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(NaryReassociate,
+                    "nary-reassociate", "Nary reassociation",
+                    false, false)
+
+// Public factory: returns a NaryReassociate pass instance allocated with new.
+FunctionPass *llvm::createNaryReassociatePass() {
+  return new NaryReassociate();
+}
+
+// Pass entry point: caches the analyses used by the helpers and then repeats
+// doOneIteration() until an iteration changes nothing. Returns whether any
+// iteration changed \p F.
+bool NaryReassociate::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  // At least one iteration always runs; stop after the first one that makes
+  // no change.
+  bool Changed = false;
+  while (doOneIteration(F))
+    Changed = true;
+  return Changed;
+}
+
+// Whitelist the instruction types NaryReassociate handles for now: add, mul
+// and getelementptr.
+static bool isPotentiallyNaryReassociable(Instruction *I) {
+  const unsigned Opcode = I->getOpcode();
+  return Opcode == Instruction::Add ||
+         Opcode == Instruction::GetElementPtr ||
+         Opcode == Instruction::Mul;
+}
+
+bool NaryReassociate::doOneIteration(Function &F) {
+ bool Changed = false;
+ SeenExprs.clear();
+ // Process the basic blocks in pre-order of the dominator tree. This order
+ // lets instructions of dominating blocks enter SeenExprs before their users.
+ for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+ Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ++I) {
+ if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) {
+ const SCEV *OldSCEV = SE->getSCEV(&*I);
+ if (Instruction *NewI = tryReassociate(&*I)) {
+ Changed = true;
+ SE->forgetValue(&*I);
+ I->replaceAllUsesWith(NewI);
+ // If SeenExprs contains I's WeakVH, that entry will be replaced with
+ // nullptr.
+ RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+ I = NewI->getIterator();
+ }
+ // Add the rewritten instruction to SeenExprs; the original instruction
+ // is deleted.
+ const SCEV *NewSCEV = SE->getSCEV(&*I);
+ SeenExprs[NewSCEV].push_back(WeakVH(&*I));
+ // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+ // is equivalent to I. However, ScalarEvolution::getSCEV may
+ // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
+ // we reassociate
+ // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+ // to
+ // NewI = &a[sext(i)] + sext(j).
+ //
+ // ScalarEvolution computes
+ // getSCEV(I) = a + 4 * sext(i + j)
+ // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
+ // which are different SCEVs.
+ //
+ // To alleviate this issue of ScalarEvolution not always capturing
+ // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
+ // map both SCEV before and after tryReassociate(I) to I.
+ //
+ // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
+ if (NewSCEV != OldSCEV)
+ SeenExprs[OldSCEV].push_back(WeakVH(&*I));
+ }
+ }
+ }
+ return Changed;
+}
+
+// Dispatches \p I to the binary-operator or GEP reassociation routine based
+// on its opcode and returns whatever that routine returns.
+Instruction *NaryReassociate::tryReassociate(Instruction *I) {
+  const unsigned Opcode = I->getOpcode();
+  if (Opcode == Instruction::Add || Opcode == Instruction::Mul)
+    return tryReassociateBinaryOp(cast<BinaryOperator>(I));
+  if (Opcode == Instruction::GetElementPtr)
+    return tryReassociateGEP(cast<GetElementPtrInst>(I));
+  llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable");
+}
+
+// FIXME: extract this method into TTI->getGEPCost.
+//
+// Describes \p GEP as base global / base register, constant offset and at most
+// one scaled index, and asks \p TTI whether that is a legal addressing mode.
+// Returns false right away if a second scaled index would be needed.
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+                          const TargetTransformInfo *TTI,
+                          const DataLayout *DL) {
+  // A global variable base is passed as BaseGV; any other base counts as a
+  // base register.
+  auto *BaseGV = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+  const bool HasBaseReg = !BaseGV;
+  int64_t ConstOffset = 0;
+  int64_t ScaleFactor = 0;
+
+  gep_type_iterator TypeIt = gep_type_begin(GEP);
+  for (auto IdxIt = GEP->idx_begin(), IdxEnd = GEP->idx_end(); IdxIt != IdxEnd;
+       ++IdxIt, ++TypeIt) {
+    if (!isa<SequentialType>(*TypeIt)) {
+      // Struct field: add the offset of the selected field.
+      StructType *STy = cast<StructType>(*TypeIt);
+      uint64_t FieldNo = cast<ConstantInt>(*IdxIt)->getZExtValue();
+      ConstOffset += DL->getStructLayout(STy)->getElementOffset(FieldNo);
+      continue;
+    }
+
+    int64_t ElemSize = DL->getTypeAllocSize(TypeIt.getIndexedType());
+    if (auto *ConstIdx = dyn_cast<ConstantInt>(*IdxIt)) {
+      // Constant index: folds into the constant offset.
+      ConstOffset += ConstIdx->getSExtValue() * ElemSize;
+      continue;
+    }
+
+    // Needs scale register.
+    if (ScaleFactor != 0) {
+      // No addressing mode takes two scale registers.
+      return false;
+    }
+    ScaleFactor = ElemSize;
+  }
+
+  const unsigned AddrSpace = GEP->getPointerAddressSpace();
+  Type *AccessTy = GEP->getType()->getElementType();
+  return TTI->isLegalAddressingMode(AccessTy, BaseGV, ConstOffset, HasBaseReg,
+                                    ScaleFactor, AddrSpace);
+}
+
+// Tries to split \p GEP at each of its sequential indices in turn and returns
+// the first rewritten GEP found, or null if there is none.
+Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) {
+  // Not worth reassociating GEP if it is foldable.
+  if (isGEPFoldable(GEP, TTI, DL))
+    return nullptr;
+
+  gep_type_iterator TypeIt = gep_type_begin(*GEP);
+  for (unsigned OpNo = 1, NumOps = GEP->getNumOperands(); OpNo != NumOps;
+       ++OpNo) {
+    // Read the type the current index applies to, then step the iterator so
+    // that it yields the type passed on as IndexedType below.
+    Type *CurTy = *TypeIt;
+    ++TypeIt;
+    if (!isa<SequentialType>(CurTy))
+      continue;
+    if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, OpNo - 1, *TypeIt))
+      return NewGEP;
+  }
+  return nullptr;
+}
+
+/// Returns true if \p Index is narrower than a pointer in the address space
+/// of \p GEP's result type.
+bool NaryReassociate::requiresSignExtension(Value *Index,
+                                            GetElementPtrInst *GEP) {
+  const unsigned AddrSpace = GEP->getType()->getPointerAddressSpace();
+  const unsigned PointerBits = DL->getPointerSizeInBits(AddrSpace);
+  const unsigned IndexBits =
+      cast<IntegerType>(Index->getType())->getBitWidth();
+  return IndexBits < PointerBits;
+}
+
+/// Tries to split the I-th index of \p GEP, when it is an add (possibly behind
+/// a sext, or a zext of a non-negative value), and reassociate the GEP around
+/// either addend. Returns the new GEP or nullptr.
+GetElementPtrInst *
+NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,
+                                          Type *IndexedType) {
+  Value *IndexToSplit = GEP->getOperand(I + 1);
+  if (auto *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
+    IndexToSplit = SExt->getOperand(0);
+  } else if (auto *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
+    // A zext behaves like a sext when its source is non-negative.
+    if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
+      IndexToSplit = ZExt->getOperand(0);
+  }
+
+  auto *AO = dyn_cast<AddOperator>(IndexToSplit);
+  if (!AO)
+    return nullptr;
+
+  // When the index needs a sext and the add may overflow in the signed sense,
+  // the add cannot be split because
+  //   sext(LHS + RHS) != sext(LHS) + sext(RHS).
+  if (requiresSignExtension(IndexToSplit, GEP) &&
+      computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
+          OverflowResult::NeverOverflows)
+    return nullptr;
+
+  Value *LHS = AO->getOperand(0);
+  Value *RHS = AO->getOperand(1);
+  // First view the index as LHS + RHS.
+  if (GetElementPtrInst *NewGEP =
+          tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+    return NewGEP;
+  // Identical addends make the mirrored attempt redundant.
+  if (LHS == RHS)
+    return nullptr;
+  // Symmetrically, view it as RHS + LHS.
+  return tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType);
+}
+
+/// Rewrites \p GEP, whose I-th index equals LHS + RHS, on top of a dominating
+/// instruction that already computes the same address with the I-th index
+/// replaced by \p LHS.
+///
+/// \param GEP          the GEP to rewrite; the new instructions are inserted
+///                     immediately before it.
+/// \param I            zero-based position of the index being split.
+/// \param LHS          the addend kept in the dominating candidate.
+/// \param RHS          the addend re-applied on top of the candidate.
+/// \param IndexedType  the type stepped over by the I-th index.
+/// \returns the new GEP (which has taken \p GEP's name), or nullptr when no
+///          candidate exists or the rewrite cannot be expressed. No IR is
+///          created on the nullptr paths.
+GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(
+    GetElementPtrInst *GEP, unsigned I, Value *LHS, Value *RHS,
+    Type *IndexedType) {
+  // Look for GEP's closest dominator that has the same SCEV as GEP except that
+  // the I-th index is replaced with LHS.
+  SmallVector<const SCEV *, 4> IndexExprs;
+  for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+    IndexExprs.push_back(SE->getSCEV(*Index));
+  // Replace the I-th index with LHS.
+  IndexExprs[I] = SE->getSCEV(LHS);
+  // NOTE(review): operand I is the pointer operand when I == 0 and the
+  // (I-1)-th index otherwise; the I-th index itself is operand I + 1. The
+  // expression is kept as is because changing it alters which SCEVs match;
+  // confirm whether getOperand(I + 1) was intended.
+  if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
+      DL->getTypeSizeInBits(LHS->getType()) <
+          DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) {
+    // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
+    // zext if the source operand is proved non-negative. We should do that
+    // consistently so that CandidateExpr more likely appears before. See
+    // @reassociate_gep_assume for an example of this canonicalization.
+    IndexExprs[I] =
+        SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
+  }
+  const SCEV *CandidateExpr = SE->getGEPExpr(
+      GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()),
+      IndexExprs, GEP->isInBounds());
+
+  Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+  if (Candidate == nullptr)
+    return nullptr;
+
+  // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+  uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+  Type *ElementType = GEP->getType()->getElementType();
+  uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+  // Because I is not necessarily the last index of the GEP, the size of the
+  // type at the I-th index (IndexedSize) is not necessarily divisible by
+  // ElementSize. For example,
+  //
+  // #pragma pack(1)
+  // struct S {
+  //   int a[3];
+  //   int64 b[8];
+  // };
+  // #pragma pack()
+  //
+  // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
+  //
+  // A zero-sized element type is rejected as well: the modulo below would
+  // otherwise divide by zero. Both checks run before any instruction is
+  // created so that bailing out leaves the function untouched.
+  //
+  // TODO: bail out on this case for now. We could emit uglygep.
+  if (ElementSize == 0 || IndexedSize % ElementSize != 0)
+    return nullptr;
+
+  IRBuilder<> Builder(GEP);
+  // Candidate does not necessarily have the same pointer type as GEP. Use
+  // bitcast or pointer cast to make sure they have the same type, so that the
+  // later RAUW doesn't complain.
+  Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
+  assert(Candidate->getType() == GEP->getType());
+
+  // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))];
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  if (RHS->getType() != IntPtrTy)
+    RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+  if (IndexedSize != ElementSize) {
+    RHS = Builder.CreateMul(
+        RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+  }
+  GetElementPtrInst *NewGEP =
+      cast<GetElementPtrInst>(Builder.CreateGEP(Candidate, RHS));
+  NewGEP->setIsInBounds(GEP->isInBounds());
+  NewGEP->takeName(GEP);
+  return NewGEP;
+}
+
+/// Tries to reassociate \p I with its operands in their original order, then
+/// swapped. Returns the replacement instruction or nullptr.
+Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) {
+  Value *Op0 = I->getOperand(0);
+  Value *Op1 = I->getOperand(1);
+  Instruction *NewI = tryReassociateBinaryOp(Op0, Op1, I);
+  if (!NewI)
+    NewI = tryReassociateBinaryOp(Op1, Op0, I);
+  return NewI;
+}
+
+/// Given I = LHS op RHS where LHS = (A op B), tries to rewrite I as
+/// (A op RHS) op B or (B op RHS) op A, reusing a dominating instruction that
+/// already computes the inner expression. Returns nullptr when LHS has other
+/// users, does not match, or no dominating instruction is found.
+Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+                                                     BinaryOperator *I) {
+  Value *A = nullptr, *B = nullptr;
+  // To be conservative, only reassociate when I is the sole user of (A op B).
+  if (!LHS->hasOneUse() || !matchTernaryOp(I, LHS, A, B))
+    return nullptr;
+
+  const SCEV *AExpr = SE->getSCEV(A);
+  const SCEV *BExpr = SE->getSCEV(B);
+  const SCEV *RHSExpr = SE->getSCEV(RHS);
+
+  // I = (A op RHS) op B
+  if (BExpr != RHSExpr) {
+    if (Instruction *NewI =
+            tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+      return NewI;
+  }
+  // I = (B op RHS) op A
+  if (AExpr != RHSExpr)
+    return tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I);
+  return nullptr;
+}
+
+/// Finds the closest dominator of \p I that computes \p LHSExpr and, if one
+/// exists, creates "dominator op RHS" immediately before \p I with the same
+/// opcode as \p I. The new instruction takes over \p I's name.
+Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+                                                      Value *RHS,
+                                                      BinaryOperator *I) {
+  Instruction *Dominator = findClosestMatchingDominator(LHSExpr, I);
+  if (!Dominator)
+    return nullptr;
+
+  Instruction *Replacement = nullptr;
+  const unsigned Opcode = I->getOpcode();
+  if (Opcode == Instruction::Add)
+    Replacement = BinaryOperator::CreateAdd(Dominator, RHS, "", I);
+  else if (Opcode == Instruction::Mul)
+    Replacement = BinaryOperator::CreateMul(Dominator, RHS, "", I);
+  else
+    llvm_unreachable("Unexpected instruction.");
+
+  Replacement->takeName(I);
+  return Replacement;
+}
+
+/// Matches \p V against "Op1 op Op2" where op is the opcode of \p I (add or
+/// mul), binding the operands on success.
+bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1,
+                                     Value *&Op2) {
+  const unsigned Opcode = I->getOpcode();
+  if (Opcode == Instruction::Add)
+    return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
+  if (Opcode == Instruction::Mul)
+    return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
+  llvm_unreachable("Unexpected instruction.");
+  return false;
+}
+
+/// Builds the SCEV for "LHS op RHS" where op is the opcode of \p I (add or
+/// mul).
+const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
+                                           const SCEV *RHS) {
+  const unsigned Opcode = I->getOpcode();
+  if (Opcode == Instruction::Add)
+    return SE->getAddExpr(LHS, RHS);
+  if (Opcode == Instruction::Mul)
+    return SE->getMulExpr(LHS, RHS);
+  llvm_unreachable("Unexpected instruction.");
+  return nullptr;
+}
+
+/// Returns the most recently recorded instruction whose SCEV is
+/// \p CandidateExpr and that dominates \p Dominatee, or nullptr if there is
+/// none. Entries that are dead or do not dominate are discarded on the way.
+Instruction *
+NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr,
+                                              Instruction *Dominatee) {
+  auto Pos = SeenExprs.find(CandidateExpr);
+  if (Pos == SeenExprs.end())
+    return nullptr;
+
+  // Basic blocks are processed in pre-order of the dominator tree, so a
+  // candidate that fails to dominate the current instruction cannot dominate
+  // any later one either. Popping it keeps the algorithm O(n).
+  auto &Candidates = Pos->second;
+  for (; !Candidates.empty(); Candidates.pop_back()) {
+    // Candidates holds WeakVHs; an entry is null once its instruction has been
+    // removed during rewriting.
+    Value *Candidate = Candidates.back();
+    if (!Candidate)
+      continue;
+    Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+    if (DT->dominates(CandidateInstruction, Dominatee))
+      return CandidateInstruction;
+  }
+  return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
new file mode 100644
index 0000000..9f26f78
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -0,0 +1,163 @@
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+
+namespace {
+ /// Function pass that rewrites calls to selected library functions so that
+ /// the common case can use a native instruction. Only sqrt and sqrtf are
+ /// handled (see the switch in runOnFunction).
+ class PartiallyInlineLibCalls : public FunctionPass {
+ public:
+ // Pass identification.
+ static char ID;
+
+ // Registers the pass with the global PassRegistry on construction.
+ PartiallyInlineLibCalls() :
+ FunctionPass(ID) {
+ initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+ private:
+ /// Optimize calls to sqrt.
+ /// Returns true if the IR was changed; in that case \p BB is updated to the
+ /// block at which the caller should resume iterating.
+ bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB, Function::iterator &BB);
+ };
+
+ char PartiallyInlineLibCalls::ID = 0;
+}
+
+// Register the pass under the command-line name "partially-inline-libcalls".
+INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
+
+/// Declares the analyses runOnFunction queries: target library info (to
+/// recognize library functions) and target transform info (to ask whether a
+/// fast sqrt exists), then defers to the base class.
+void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+}
+
+/// Scans every basic block of \p F for calls to known library functions and
+/// rewrites the ones this pass handles. Returns true if anything changed.
+bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
+ bool Changed = false;
+ Function::iterator CurrBB;
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ // Advance BB up front; optimizeSQRT may redirect it to the block that
+ // holds the rest of the current block after a split.
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ // Only direct calls are considered.
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+ // Skip if function either has local linkage or is not a known library
+ // function.
+ LibFunc::Func LibFunc;
+ if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+ !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+ continue;
+
+ // Inside the switch, `break` means "the call was rewritten" and falls
+ // through to the code after the switch; `continue` moves on to the next
+ // instruction of the current block.
+ switch (LibFunc) {
+ case LibFunc::sqrtf:
+ case LibFunc::sqrt:
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ // The current block was split, so stop iterating over it; the outer loop
+ // resumes at BB as set by optimizeSQRT.
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+/// Splits a sqrt call into a readnone call plus a fallback library call that
+/// runs only when the first result is a NaN.
+///
+/// \param Call        the sqrt call to rewrite.
+/// \param CalledFunc  the callee; not used by this function.
+/// \param CurrBB      the block containing \p Call; it is split after the call.
+/// \param BB          set to the new join block on success.
+/// \returns true if the IR was changed.
+bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
+ Function *CalledFunc,
+ BasicBlock &CurrBB,
+ Function::iterator &BB) {
+ // There is no need to change the IR, since backend will emit sqrt
+ // instruction if the call has already been marked read-only.
+ if (Call->onlyReadsMemory())
+ return false;
+
+ // The call must have the expected result type.
+ if (!Call->getType()->isFloatingPointTy())
+ return false;
+
+ // Do the following transformation:
+ //
+ // (before)
+ // dst = sqrt(src)
+ //
+ // (after)
+ // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+ // if (v0 is a NaN)
+ // v1 = sqrt(src) # library call.
+ // dst = phi(v0, v1)
+ //
+
+ // Move all instructions following Call to newly created block JoinBB.
+ // Create phi and replace all uses.
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
+ IRBuilder<> Builder(JoinBB, JoinBB->begin());
+ PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+ Call->replaceAllUsesWith(Phi);
+
+ // Create basic block LibCallBB and insert a call to library function sqrt.
+ // The clone is taken before the readnone attribute is added below, so it
+ // keeps the original call's attributes.
+ BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+ CurrBB.getParent(), JoinBB);
+ Builder.SetInsertPoint(LibCallBB);
+ Instruction *LibCall = Call->clone();
+ Builder.Insert(LibCall);
+ Builder.CreateBr(JoinBB);
+
+ // Add attribute "readnone" so that backend can use a native sqrt instruction
+ // for this call. Insert a FP compare instruction and a conditional branch
+ // at the end of CurrBB.
+ Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ // Drop the terminator that currently ends CurrBB; it is replaced by the
+ // conditional branch created below.
+ CurrBB.getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(&CurrBB);
+ // An ordered-equal compare of a value with itself is false only for NaN.
+ Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+ Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+ // Add phi operands.
+ Phi->addIncoming(Call, &CurrBB);
+ Phi->addIncoming(LibCall, LibCallBB);
+
+ // Tell the caller to continue scanning from the join block.
+ BB = JoinBB->getIterator();
+ return true;
+}
+
+/// Factory for the pass; returns a newly allocated PartiallyInlineLibCalls.
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+ return new PartiallyInlineLibCalls();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 0000000..b56b355
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,953 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in a gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+// Counters for the safepoints this pass inserts.
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumCallSafepoints, "Number of call safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+// Counters for backedges that were deliberately left without a poll.
+STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop");
+STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+ cl::init(false));
+
+/// How narrow does the trip count of a loop have to be to be considered
+/// "counted"? Counted loops do not get safepoints at backedges.
+/// The value is a bit width: mustBeFiniteCountedLoop checks that the maximum
+/// trip count fits in this many bits.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+ cl::Hidden, cl::init(32));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+ cl::init(false));
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false));
+
+namespace {
+
+/// Analysis-style pass that records, for every loop backedge that needs a
+/// safepoint poll, the terminator of the corresponding latch block. It never
+/// modifies the IR.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+  static char ID;
+
+  /// Result of the pass: one entry per backedge that needs a poll, identified
+  /// by the branch that forms the backedge.
+  std::vector<TerminatorInst *> PollLocations;
+
+  /// True unless we're running spp-no-calls, in which case the call-dependent
+  /// placement optimizations must be disabled.
+  bool CallSafepointsEnabled;
+
+  ScalarEvolution *SE = nullptr;
+  DominatorTree *DT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+      : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+    initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *);
+
+  /// Processes all loops nested in \p L first, then \p L itself.
+  void runOnLoopAndSubLoops(Loop *L) {
+    for (Loop *SubLoop : *L)
+      runOnLoopAndSubLoops(SubLoop);
+    runOnLoop(L);
+  }
+
+  /// Caches the required analyses and visits every loop nest of \p F. Always
+  /// returns false because the IR is left untouched.
+  bool runOnFunction(Function &F) override {
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    for (Loop *TopLevelLoop : *LI)
+      runOnLoopAndSubLoops(TopLevelLoop);
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    // This pass no longer modifies the IR at all, so every analysis is
+    // preserved.
+    AU.setPreservesAll();
+  }
+};
+}
+
+// Hidden switches that turn off one kind of safepoint each; they are read by
+// enableEntrySafepoints, enableCallSafepoints and enableBackedgeSafepoints.
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+/// The function pass that performs safepoint placement. It declares no
+/// required analyses and preserves none.
+struct PlaceSafepoints : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ // Registers the pass with the global PassRegistry on construction.
+ PlaceSafepoints() : FunctionPass(ID) {
+ initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ // Intentionally empty: nothing is required and nothing is preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We modify the graph wholesale (inlining, block insertion, etc). We
+ // preserve nothing at the moment. We could potentially preserve dom tree
+ // if that was worth doing
+ }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call, that's the
+// caller's job. Call sites that need to be made parseable are reported
+// through ParsePointsNeeded.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+
+/// Returns true if the call site \p CS must be made parseable. GC leaf
+/// functions, inline assembly, and the statepoint / gc.relocate / gc.result
+/// call sites themselves are excluded.
+static bool needsStatepoint(const CallSite &CS) {
+  if (callsGCLeafFunction(CS))
+    return false;
+
+  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->isInlineAsm())
+    return false;
+
+  const bool IsStatepointMachinery =
+      isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS);
+  return !IsStatepointMachinery;
+}
+
+// Forward declaration; the definition appears later in this file.
+static Value *ReplaceWithStatepoint(const CallSite &CS);
+
+/// Returns true if this loop is known to contain a call safepoint which must
+/// execute on every iteration that returns to the loop header through the
+/// edge from \p Pred. The answer is conservative: false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+                                               BasicBlock *Pred,
+                                               DominatorTree &DT) {
+  // In general we want any cut of the graph that guarantees a call safepoint
+  // on every path between Header and Pred. For now only cuts made of a single
+  // call are recognized, located in a block that is dominated by Header and
+  // dominates the latch (Pred). Walking the whole chain of such dominating
+  // blocks finds substantially more cases than checking only Pred and Header,
+  // possibly because range and null checks create many loop exits.
+  // TODO: structure this as an analysis pass, cache the result for subloops,
+  // avoid dom tree recalculations
+  assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+  // Walk from the latch up the immediate-dominator chain until Header has been
+  // inspected.
+  for (BasicBlock *Current = Pred;;
+       Current = DT.getNode(Current)->getIDom()->getBlock()) {
+    for (Instruction &I : *Current) {
+      CallSite CS(&I);
+      // Note: strictly speaking, "needs a statepoint" is not quite the right
+      // test; the callee should be checked for an unconditional poll. This is
+      // only a theoretical concern because no method has conditional-only
+      // safepoint polls.
+      if (CS && needsStatepoint(CS))
+        return true;
+    }
+
+    if (Current == Header)
+      return false;
+  }
+}
+
+/// Returns true if this loop is known to terminate within a bounded number of
+/// iterations. The analysis is conservative, so false may be returned for a
+/// loop that does in fact run a finite, constant number of iterations.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+                                    BasicBlock *Pred) {
+  // True when Count is computable and its unsigned maximum fits in
+  // CountedLoopTripWidth bits.
+  auto IsNarrowCount = [SE](const SCEV *Count) {
+    return Count != SE->getCouldNotCompute() &&
+           SE->getUnsignedRange(Count).getUnsignedMax().isIntN(
+               CountedLoopTripWidth);
+  };
+
+  // A conservative bound on the loop as a whole.
+  if (IsNarrowCount(SE->getMaxBackedgeTakenCount(L)))
+    return true;
+
+  // If Pred is a conditional branch to the header whose other target leaves
+  // the loop, the exit count of that block can be queried directly.
+  if (!L->isLoopExiting(Pred))
+    return /* not finite */ false;
+
+  // This returns an exact expression only. TODO: We really only need an
+  // upper bound here, but SE doesn't expose that.
+  return IsNarrowCount(SE->getExitCount(L, Pred));
+}
+
+/// Scans the instructions of start's block from \p start up to (excluding)
+/// \p end or the end of the block, appending every call to \p calls. If the
+/// terminator is reached, each successor not yet in \p seen is added to both
+/// \p seen and \p worklist.
+static void scanOneBB(Instruction *start, Instruction *end,
+                      std::vector<CallInst *> &calls,
+                      std::set<BasicBlock *> &seen,
+                      std::vector<BasicBlock *> &worklist) {
+  for (BasicBlock::iterator It(start);
+       It != start->getParent()->end() && It != BasicBlock::iterator(end);
+       ++It) {
+    Instruction *Inst = &*It;
+    if (CallInst *CI = dyn_cast<CallInst>(Inst))
+      calls.push_back(CI);
+
+    // FIXME: This code does not handle invokes
+    assert(!isa<InvokeInst>(Inst) &&
+           "support for invokes in poll code needed");
+
+    // Successors are queued only when the terminator is reached without
+    // running into end first.
+    if (!Inst->isTerminator())
+      continue;
+    for (BasicBlock *Succ : successors(Inst->getParent())) {
+      if (seen.insert(Succ).second)
+        worklist.push_back(Succ);
+    }
+  }
+}
+/// Collects into \p calls every call reachable from \p start without passing
+/// \p end, following successor edges block by block. \p calls is cleared
+/// first; \p seen accumulates the visited blocks.
+static void scanInlinedCode(Instruction *start, Instruction *end,
+                            std::vector<CallInst *> &calls,
+                            std::set<BasicBlock *> &seen) {
+  calls.clear();
+
+  std::vector<BasicBlock *> Pending;
+  seen.insert(start->getParent());
+  scanOneBB(start, end, calls, seen, Pending);
+
+  // Drain the worklist from the back; each scan may push further blocks.
+  while (!Pending.empty()) {
+    BasicBlock *Next = Pending.back();
+    Pending.pop_back();
+    scanOneBB(&*Next->begin(), end, calls, seen, Pending);
+  }
+}
+
+/// Records in PollLocations the terminator of every latch of \p L whose
+/// backedge needs a safepoint poll. Always returns false (no IR change).
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+  // Every backedge is a potential poll site. LoopSimplify usually leaves a
+  // single latch, but this code must stay correct for loops with several.
+  BasicBlock *Header = L->getHeader();
+  SmallVector<BasicBlock *, 16> Latches;
+  L->getLoopLatches(Latches);
+
+  // Policy decision: can the poll on the backedge leaving Latch be omitted?
+  // This is about unburdening the optimizer in loops, not about the runtime
+  // cost of the poll itself.
+  auto CanOmitPoll = [&](BasicBlock *Latch) {
+    if (AllBackedges)
+      return false;
+
+    if (mustBeFiniteCountedLoop(L, SE, Latch)) {
+      if (TraceLSP)
+        errs() << "skipping safepoint placement in finite loop\n";
+      FiniteExecution++;
+      return true;
+    }
+
+    if (CallSafepointsEnabled &&
+        containsUnconditionalCallSafepoint(L, Header, Latch, *DT)) {
+      // Note: this is only semantically legal because no further IPO or
+      // inlining happens before the actual call insertion; otherwise the call
+      // safepoint relied upon here might later be lost.
+      if (TraceLSP)
+        errs() << "skipping safepoint placement due to unconditional call\n";
+      CallInLoop++;
+      return true;
+    }
+
+    return false;
+  };
+
+  for (BasicBlock *Latch : Latches) {
+    assert(L->contains(Latch));
+
+    if (CanOmitPoll(Latch))
+      continue;
+
+    // TODO: We can create an inner loop which runs a finite number of
+    // iterations with an outer loop which contains a safepoint. This would
+    // not help runtime performance that much, but it might help our ability
+    // to optimize the inner loop.
+
+    // Inserting the safepoint would mean creating a new block as the target
+    // of this backedge, which polls (with all live variables) and then
+    // branches to the true header. Here only the location is recorded.
+    TerminatorInst *Term = Latch->getTerminator();
+
+    if (TraceLSP) {
+      errs() << "[LSP] terminator instruction: ";
+      Term->dump();
+    }
+
+    PollLocations.push_back(Term);
+  }
+
+  return false;
+}
+
+/// Returns true if no entry safepoint is required before the call site \p CS
+/// in the caller. Only intrinsic calls can qualify.
+static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
+  if (!II)
+    return false;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::experimental_gc_statepoint:
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    // These can wrap an actual call which may grow the stack by an unbounded
+    // amount or run forever.
+    return false;
+  default:
+    // Most LLVM intrinsics do not expand to actual calls, or if they do, the
+    // callees are leaf functions that cause only finite stack growth. In
+    // particular, the optimizer likes to form things like memsets out of
+    // stores in the original IR. Another important example is
+    // llvm.localescape, which must occur in the entry block; inserting a
+    // safepoint before it is not legal since it could push the localescape
+    // out of the entry block.
+    return true;
+  }
+}
+
+/// Returns the instruction before which the entry safepoint poll should be
+/// placed: the first call on the straight-line path from the function entry
+/// that requires a preceding safepoint, or the instruction at which that path
+/// ends.
+static Instruction *findLocationForEntrySafepoint(Function &F,
+                                                  DominatorTree &DT) {
+  // Conceptually the poll belongs at method entry, but in practice it is
+  // placed as late in the entry block as possible. It may go as late as we
+  // like provided it dominates all calls that can grow the stack. Together
+  // with backedge polls this gives all the progress guarantees we need.
+
+  // The two helpers below walk a "straight line" execution sequence: within a
+  // block, and across an edge only when the block has a unique successor
+  // which in turn has a unique predecessor.
+  auto hasNextInstruction = [](Instruction *I) -> bool {
+    if (!I->isTerminator())
+      return true;
+    BasicBlock *OnlySucc = I->getParent()->getUniqueSuccessor();
+    return OnlySucc && (OnlySucc->getUniquePredecessor() != nullptr);
+  };
+
+  auto nextInstruction =
+      [&hasNextInstruction](Instruction *I) -> Instruction * {
+    assert(hasNextInstruction(I) &&
+           "first check if there is a next instruction!");
+    return I->isTerminator() ? &I->getParent()->getUniqueSuccessor()->front()
+                             : &*++I->getIterator();
+  };
+
+  Instruction *Cursor = &F.getEntryBlock().front();
+  while (hasNextInstruction(Cursor)) {
+    // A safepoint poll must occur before any 'real' call. The easiest way to
+    // guarantee finite execution between safepoints in the presence of
+    // recursive and mutually recursive functions is to make each of them take
+    // a safepoint. A poll is also needed before any call which can grow the
+    // stack by an unbounded amount. GC semantics do not strictly require
+    // that, but languages which detect stack overflow via guard pages and
+    // then throw exceptions commonly do.
+    CallSite CS(Cursor);
+    if (CS && !doesNotRequireEntrySafepointBefore(CS))
+      break;
+    Cursor = nextInstruction(Cursor);
+  }
+
+  assert((hasNextInstruction(Cursor) || Cursor->isTerminator()) &&
+         "either we stopped because of a call, or because of terminator");
+
+  return Cursor;
+}
+
+/// Collects into \p Found every call or invoke in \p F that needs parseable
+/// state, in instruction order. \p Found must be empty on entry.
+static void findCallSafepoints(Function &F,
+                               std::vector<CallSite> &Found /*rval*/) {
+  assert(Found.empty() && "must be empty!");
+  for (Instruction &I : instructions(F)) {
+    if (!isa<CallInst>(&I) && !isa<InvokeInst>(&I))
+      continue;
+
+    CallSite CS(&I);
+    // Call sites that neither need nor want a safepoint are left out.
+    if (needsStatepoint(CS))
+      Found.push_back(CS);
+  }
+}
+
+/// Removes duplicate elements from \p vec in place while keeping the first
+/// occurrence of each value in its original relative order.
+///
+/// The input is deliberately not sorted first: doing so changes the output of
+/// a couple of tests in ways which make them less useful in testing fused
+/// safepoints.
+///
+/// \tparam T  element type; must be copyable and ordered by operator<.
+template <typename T> static void unique_unsorted(std::vector<T> &vec) {
+  std::set<T> seen;
+  std::vector<T> tmp;
+  std::swap(tmp, vec);
+  // vec is empty after the swap; reserving now (not before the swap, where it
+  // would be a no-op on storage that is about to be handed to tmp) lets the
+  // refill below run without reallocating.
+  vec.reserve(tmp.size());
+  for (const T &V : tmp) {
+    if (seen.insert(V).second)
+      vec.push_back(V);
+  }
+}
+
+/// Name of the function that provides the safepoint poll implementation.
+static const char *const GCSafepointPollName = "gc.safepoint_poll";
+
+/// Returns true if \p F is the safepoint poll function itself, i.e. its name
+/// is exactly GCSafepointPollName.
+static bool isGCSafepointPoll(Function &F) {
+  const StringRef Name = F.getName();
+  return Name == GCSafepointPollName;
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+///
+/// Currently a function qualifies only if it has a GC and that GC's name is
+/// either "statepoint-example" or "coreclr".
+static bool shouldRewriteFunction(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (!F.hasGC())
+    return false;
+
+  const StringRef GCName(F.getGC());
+  return GCName == "statepoint-example" || GCName == "coreclr";
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+//
+// Each predicate below currently ignores its Function argument and simply
+// returns the negation of the corresponding flag (NoEntry, NoBackedge,
+// NoCall); those flags are defined outside this part of the file.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+// Normalize a basic block so that it is ready to be the target of an invoke
+// statepoint, i.e. ensure the returned block does not start with a phi node.
+// If 'BB' does not have a unique predecessor, the edge from 'InvokeParent' is
+// split off and the block returned by SplitBlockPredecessors is used;
+// otherwise 'BB' itself is used.
+static BasicBlock *normalizeForInvokeSafepoint(BasicBlock *BB,
+                                               BasicBlock *InvokeParent) {
+  BasicBlock *Target = BB->getUniquePredecessor()
+                           ? BB
+                           : SplitBlockPredecessors(BB, InvokeParent, "");
+
+  // 'Target' now has a unique predecessor, so all of its phi nodes can
+  // safely be folded away.
+  FoldSingleEntryPHINodes(Target);
+  assert(!isa<PHINode>(Target->begin()));
+
+  return Target;
+}
+
+/// Insert safepoint polls (at loop backedges and at function entry, when
+/// enabled) into \p F and then replace every call site that needs parseable
+/// state with a gc.statepoint.
+///
+/// Declarations, empty functions, the gc.safepoint_poll function itself and
+/// functions rejected by shouldRewriteFunction() are left untouched.
+///
+/// \returns true if the function was modified.
+bool PlaceSafepoints::runOnFunction(Function &F) {
+  if (F.isDeclaration() || F.empty()) {
+    // This is a declaration, nothing to do.  Must exit early to avoid crash in
+    // dom tree calculation
+    return false;
+  }
+
+  if (isGCSafepointPoll(F)) {
+    // Given we're inlining this inside of safepoint poll insertion, this
+    // doesn't make any sense.  Note that we do make any contained calls
+    // parseable after we inline a poll.
+    return false;
+  }
+
+  if (!shouldRewriteFunction(F))
+    return false;
+
+  bool modified = false;
+
+  // In various bits below, we rely on the fact that uses are reachable from
+  // defs.  When there are basic blocks unreachable from the entry, dominance
+  // and reachability queries return nonsensical results.  Thus, we preprocess
+  // the function to ensure these properties hold.
+  modified |= removeUnreachableBlocks(F);
+
+  // STEP 1 - Insert the safepoint polling locations.  We do not need to
+  // actually insert parse points yet.  That will be done for all polls and
+  // calls in a single pass.
+
+  DominatorTree DT;
+  DT.recalculate(F);
+
+  SmallVector<Instruction *, 16> PollsNeeded;
+  std::vector<CallSite> ParsePointNeeded;
+
+  if (enableBackedgeSafepoints(F)) {
+    // Construct a pass manager to run the LoopPass backedge logic.  We
+    // need the pass manager to handle scheduling all the loop passes
+    // appropriately.  Doing this by hand is painful and just not worth messing
+    // with for the moment.
+    legacy::FunctionPassManager FPM(F.getParent());
+    bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+    PlaceBackedgeSafepointsImpl *PBS =
+        new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+    FPM.add(PBS);
+    FPM.run(F);
+
+    // We preserve dominance information when inserting the poll, otherwise
+    // we'd have to recalculate this on every insert
+    DT.recalculate(F);
+
+    auto &PollLocations = PBS->PollLocations;
+
+    auto OrderByBBName = [](Instruction *a, Instruction *b) {
+      return a->getParent()->getName() < b->getParent()->getName();
+    };
+    // We need the order of list to be stable so that naming ends up stable
+    // when we split edges.  This makes test cases much easier to write.
+    // A stable sort keeps locations whose blocks tie on name (for example
+    // unnamed blocks) in the order in which they were discovered.
+    std::stable_sort(PollLocations.begin(), PollLocations.end(),
+                     OrderByBBName);
+
+    // We can sometimes end up with duplicate poll locations.  This happens if
+    // a single loop is visited more than once.   The fact this happens seems
+    // wrong, but it does happen for the split-backedge.ll test case.
+    //
+    // Sorting by block name alone does not guarantee that duplicates end up
+    // adjacent (distinct blocks compare equal when their names tie), so
+    // std::unique is not sufficient.  Filter through a set instead, keeping
+    // the first occurrence of each location so the sorted order is preserved.
+    std::set<const Instruction *> SeenPollLocations;
+    auto KeepEnd = PollLocations.begin();
+    for (auto It = PollLocations.begin(), End = PollLocations.end(); It != End;
+         ++It) {
+      if (SeenPollLocations.insert(*It).second)
+        *KeepEnd++ = *It;
+    }
+    PollLocations.erase(KeepEnd, PollLocations.end());
+
+    // Insert a poll at each point the analysis pass identified
+    // The poll location must be the terminator of a loop latch block.
+    for (TerminatorInst *Term : PollLocations) {
+      // We are inserting a poll, the function is modified
+      modified = true;
+
+      if (SplitBackedge) {
+        // Split the backedge of the loop and insert the poll within that new
+        // basic block.  This creates a loop with two latches per original
+        // latch (which is non-ideal), but this appears to be easier to
+        // optimize in practice than inserting the poll immediately before the
+        // latch test.
+
+        // Since this is a latch, at least one of the successors must dominate
+        // it.  It's possible that we have a) duplicate edges to the same
+        // header and b) edges to distinct loop headers.  We need to insert
+        // polls on each.
+        SetVector<BasicBlock *> Headers;
+        for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+          BasicBlock *Succ = Term->getSuccessor(i);
+          if (DT.dominates(Succ, Term->getParent())) {
+            Headers.insert(Succ);
+          }
+        }
+        assert(!Headers.empty() && "poll location is not a loop latch?");
+
+        // The split loop structure here is so that we only need to recalculate
+        // the dominator tree once.  Alternatively, we could just keep it up to
+        // date and use a more natural merged loop.
+        for (BasicBlock *Header : Headers) {
+          BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+          PollsNeeded.push_back(NewBB->getTerminator());
+          NumBackedgeSafepoints++;
+        }
+      } else {
+        // Split the latch block itself, right before the terminator.
+        PollsNeeded.push_back(Term);
+        NumBackedgeSafepoints++;
+      }
+    }
+  }
+
+  if (enableEntrySafepoints(F)) {
+    Instruction *Location = findLocationForEntrySafepoint(F, DT);
+    if (!Location) {
+      // policy choice not to insert?
+    } else {
+      PollsNeeded.push_back(Location);
+      modified = true;
+      NumEntrySafepoints++;
+    }
+  }
+
+  // Now that we've identified all the needed safepoint poll locations, insert
+  // safepoint polls themselves.
+  for (Instruction *PollLocation : PollsNeeded) {
+    std::vector<CallSite> RuntimeCalls;
+    InsertSafepointPoll(PollLocation, RuntimeCalls);
+    ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+                            RuntimeCalls.end());
+  }
+  PollsNeeded.clear(); // make sure we don't accidentally use these again
+  // The dominator tree has been invalidated by the inlining performed in the
+  // above loop.  TODO: Teach the inliner how to update the dom tree?
+  DT.recalculate(F);
+
+  if (enableCallSafepoints(F)) {
+    std::vector<CallSite> Calls;
+    findCallSafepoints(F, Calls);
+    NumCallSafepoints += Calls.size();
+    ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end());
+  }
+
+  // Unique the vectors since we can end up with duplicates if we scan the call
+  // site for call safepoints after we add it for entry or backedge.  The
+  // only reason we need tracking at all is that some functions might have
+  // polls but not call safepoints and thus we might miss marking the runtime
+  // calls for the polls. (This is useful in test cases!)
+  unique_unsorted(ParsePointNeeded);
+
+  // Any parse point (no matter what source) will be handled here
+
+  // We're about to start modifying the function
+  if (!ParsePointNeeded.empty())
+    modified = true;
+
+  // Now run through and insert the safepoints, but do _NOT_ update or remove
+  // any existing uses.  We have references to live variables that need to
+  // survive to the last iteration of this loop.
+  std::vector<Value *> Results;
+  Results.reserve(ParsePointNeeded.size());
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+
+    // For invoke statepoints we need to remove all phi nodes at the normal
+    // destination block.
+    // Reason for this is that we can place gc_result only after last phi node
+    // in basic block. We will get malformed code after RAUW for the
+    // gc_result if one of these phi nodes uses the result from the invoke.
+    if (InvokeInst *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      normalizeForInvokeSafepoint(Invoke->getNormalDest(),
+                                  Invoke->getParent());
+    }
+
+    Value *GCResult = ReplaceWithStatepoint(CS);
+    Results.push_back(GCResult);
+  }
+  assert(Results.size() == ParsePointNeeded.size());
+
+  // Adjust all users of the old call sites to use the new ones instead
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+    Value *GCResult = Results[i];
+    if (GCResult) {
+      // Can not RAUW for the invoke gc result in case of phi nodes present.
+      assert(CS.isCall() || !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin()));
+
+      // Replace all uses with the new call
+      CS.getInstruction()->replaceAllUsesWith(GCResult);
+    }
+
+    // Now that we've handled all uses, remove the original call itself
+    // Note: The insert point can't be the deleted instruction!
+    CS.getInstruction()->eraseFromParent();
+  }
+  return modified;
+}
+
+// Definitions of the static pass ID members of the two passes in this file.
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+// Public factory: returns a newly allocated PlaceSafepoints pass.
+FunctionPass *llvm::createPlaceSafepointsPass() {
+ return new PlaceSafepoints();
+}
+
+// Register the backedge helper pass under the name
+// "place-backedge-safepoints-impl" and declare the three wrapper passes
+// listed below as its dependencies.
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+
+// Register the main pass under the name "place-safepoints"; no pass
+// dependencies are declared for it here.
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+
+/// Insert a safepoint poll immediately before \p InsertBefore.
+///
+/// A call to the gc.safepoint_poll function is created before
+/// \p InsertBefore and then inlined. The calls found in the inlined code for
+/// which needsStatepoint() holds are appended to \p ParsePointsNeeded, which
+/// must be empty on entry.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Module *M = InsertBefore->getModule();
+  assert(M && "must be part of a module");
+
+  // Inline the safepoint poll implementation - this will get all the branch,
+  // control flow, etc..  Most importantly, it will introduce the actual slow
+  // path call - where we need to insert a safepoint (parsepoint).
+
+  auto *F = M->getFunction(GCSafepointPollName);
+  assert(F && "gc.safepoint_poll function is missing");
+  assert(F->getType()->getElementType() ==
+         FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+         "gc.safepoint_poll declared with wrong type");
+  assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+  CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+  // Remember the instructions on either side of the poll call before it is
+  // inlined away, so the newly inserted range can be located afterwards.
+  BasicBlock::iterator before(PollCall), after(PollCall);
+  const bool isBegin = (before == OrigBB->begin());
+  if (!isBegin)
+    --before;
+  ++after;
+  assert(after != OrigBB->end() && "must have successor");
+
+  // do the actual inlining
+  InlineFunctionInfo IFI;
+  bool InlineStatus = InlineFunction(PollCall, IFI);
+  assert(InlineStatus && "inline must succeed");
+  (void)InlineStatus; // suppress warning in release-asserts
+
+  // Check post conditions
+  assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+  std::vector<CallInst *> calls; // new calls
+  std::set<BasicBlock *> BBs;    // new BBs + insertee
+  // Include only the newly inserted instructions.  Note: 'before' is not
+  // usable if the poll was inserted at the beginning of the basic block; in
+  // that case start from the block's first instruction instead.
+  BasicBlock::iterator start;
+  if (isBegin) {
+    start = OrigBB->begin();
+  } else {
+    start = before;
+    ++start;
+  }
+
+  // If your poll function includes an unreachable at the end, that's not
+  // valid.  Bugpoint likes to create this, so check for it.
+  assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) &&
+         "malformed poll function");
+
+  scanInlinedCode(&*(start), &*(after), calls, BBs);
+  assert(!calls.empty() && "slow path not found for safepoint poll");
+
+  // Record the fact we need a parsable state at the runtime call contained in
+  // the poll function.  This is required so that the runtime knows how to
+  // parse the last frame when we actually take the safepoint (i.e. execute
+  // the slow path)
+  assert(ParsePointsNeeded.empty());
+  for (CallInst *Call : calls) {
+    // These are likely runtime calls.  Should we assert that via calling
+    // convention or something?  Calls where no safepoint is needed or wanted
+    // are skipped.
+    if (needsStatepoint(Call))
+      ParsePointsNeeded.push_back(CallSite(Call));
+  }
+  assert(ParsePointsNeeded.size() <= calls.size());
+}
+
+/// Replaces the given call site (Call or Invoke) with a gc.statepoint
+/// intrinsic with an empty deoptimization arguments list. This does
+/// NOT do explicit relocation for GC support.
+///
+/// The original instruction is neither erased nor are its uses rewritten
+/// here; this function only inserts the new statepoint (and, if needed, a
+/// gc_result).
+///
+/// \returns the newly created gc_result call if the original call site has a
+/// non-void type and at least one use, otherwise nullptr.
+static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) {
+ assert(CS.getInstruction()->getModule() && "must be set");
+
+ // TODO: technically, a pass is not allowed to get functions from within a
+ // function pass since it might trigger a new function addition. Refactor
+ // this logic out to the initialization of the pass. Doesn't appear to
+ // matter in practice.
+
+ // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the instruction being replaced under the assumption
+ // that all arguments will be available here. We can't insert afterwards
+ // since we may be replacing a terminator.
+ IRBuilder<> Builder(CS.getInstruction());
+
+ // Note: The gc args are not filled in at this time, that's handled by
+ // RewriteStatepointsForGC (which is currently under review).
+
+ // Create the statepoint given all the arguments
+ Instruction *Token = nullptr;
+
+ // Statepoint ID and patch-byte count. They are taken from the call site's
+ // "statepoint-id" / "statepoint-num-patch-bytes" string attributes when
+ // those are present and parse as base-10 integers; otherwise the defaults
+ // assigned below are used.
+ uint64_t ID;
+ uint32_t NumPatchBytes;
+
+ AttributeSet OriginalAttrs = CS.getAttributes();
+ Attribute AttrID =
+ OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+ Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
+ AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
+
+ // Attributes consumed here are collected so that they can be stripped from
+ // the set copied onto the new statepoint.
+ AttrBuilder AttrsToRemove;
+ // The getAsInteger result is negated: a false return is treated as a
+ // successful parse into ID.
+ bool HasID = AttrID.isStringAttribute() &&
+ !AttrID.getValueAsString().getAsInteger(10, ID);
+
+ if (HasID)
+ AttrsToRemove.addAttribute("statepoint-id");
+ else
+ ID = 0xABCDEF00; // default ID when none was supplied
+
+ bool HasNumPatchBytes =
+ AttrNumPatchBytes.isStringAttribute() &&
+ !AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
+
+ if (HasNumPatchBytes)
+ AttrsToRemove.addAttribute("statepoint-num-patch-bytes");
+ else
+ NumPatchBytes = 0;
+
+ OriginalAttrs = OriginalAttrs.removeAttributes(
+ CS.getInstruction()->getContext(), AttributeSet::FunctionIndex,
+ AttrsToRemove);
+
+ if (CS.isCall()) {
+ CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *Call = Builder.CreateGCStatepointCall(
+ ID, NumPatchBytes, CS.getCalledValue(),
+ makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None,
+ "safepoint_token");
+ // Carry over the tail-call marker and calling convention of the original.
+ Call->setTailCall(ToReplace->isTailCall());
+ Call->setCallingConv(ToReplace->getCallingConv());
+
+ // In case if we can handle this set of attributes - set up function
+ // attributes directly on statepoint and return attributes later for
+ // gc_result intrinsic.
+ Call->setAttributes(OriginalAttrs.getFnAttributes());
+
+ Token = Call;
+
+ // Put the following gc_result and gc_relocate calls immediately after
+ // the old call (which we're about to delete).
+ assert(ToReplace->getNextNode() && "not a terminator, must have next");
+ Builder.SetInsertPoint(ToReplace->getNextNode());
+ Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+ } else if (CS.isInvoke()) {
+ InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ Builder.SetInsertPoint(ToReplace->getParent());
+ InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+ ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(),
+ ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()),
+ None, None, "safepoint_token");
+
+ Invoke->setCallingConv(ToReplace->getCallingConv());
+
+ // In case if we can handle this set of attributes - set up function
+ // attributes directly on statepoint and return attributes later for
+ // gc_result intrinsic.
+ Invoke->setAttributes(OriginalAttrs.getFnAttributes());
+
+ Token = Invoke;
+
+ // We'll insert the gc.result into the normal block
+ BasicBlock *NormalDest = ToReplace->getNormalDest();
+ // Can not insert gc.result in case of phi nodes present.
+ // These cases should have been removed prior to running this function
+ assert(!isa<PHINode>(NormalDest->begin()));
+ Instruction *IP = &*(NormalDest->getFirstInsertionPt());
+ Builder.SetInsertPoint(IP);
+ } else {
+ llvm_unreachable("unexpect type of CallSite");
+ }
+ assert(Token);
+
+ // Handle the return value of the original call - update all uses to use a
+ // gc_result hanging off the statepoint node we just inserted
+
+ // Only add the gc_result iff there is actually a used result
+ if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+ // Request the original instruction's name (if it has one) for the
+ // gc_result.
+ std::string TakenName =
+ CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+ CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), TakenName);
+ // Return attributes of the original call site move to the gc_result.
+ GCResult->setAttributes(OriginalAttrs.getRetAttributes());
+ return GCResult;
+ } else {
+ // No return value for the call.
+ return nullptr;
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 0000000..bcadd4e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,2306 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are assigned small distinct ranks (starting at 3), and
+// other values are assigned ranks derived from the reverse post order
+// traversal of the current function, which effectively gives values in deep
+// loops higher rank than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "reassociate"
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+namespace {
+  /// An operand of an expression paired with its rank.
+  struct ValueEntry {
+    unsigned Rank;
+    Value *Op;
+    ValueEntry(unsigned TheRank, Value *TheOp) : Rank(TheRank), Op(TheOp) {}
+  };
+  /// Strict ordering on rank only, inverted so that sorting with it places
+  /// the highest rank at the start.
+  inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
+    return RHS.Rank < LHS.Rank;
+  }
+}
+
+#ifndef NDEBUG
+/// Debug-only helper: print the expression identified in the Ops list to
+/// dbgs(). Emits the opcode name of \p I and the type of the first operand,
+/// followed by every operand together with its rank. \p Ops must not be
+/// empty, since its first entry is read unconditionally.
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+  Module *M = I->getModule();
+  raw_ostream &OS = dbgs();
+  OS << Instruction::getOpcodeName(I->getOpcode()) << " "
+     << *Ops[0].Op->getType() << '\t';
+  for (const ValueEntry &Entry : Ops) {
+    OS << "[ ";
+    Entry.Op->printAsOperand(OS, false, M);
+    OS << ", #" << Entry.Rank << "] ";
+  }
+}
+#endif
+
+namespace {
+  /// \brief Utility class representing a base and exponent pair which form one
+  /// factor of some product.
+  struct Factor {
+    Value *Base;
+    unsigned Power;
+
+    Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {}
+
+    /// \brief Sort factors in descending order by their power.
+    struct PowerDescendingSorter {
+      // const-qualified so the functor can be invoked through a const object.
+      bool operator()(const Factor &LHS, const Factor &RHS) const {
+        return LHS.Power > RHS.Power;
+      }
+    };
+
+    /// \brief Compare factors for equal powers.
+    struct PowerEqual {
+      bool operator()(const Factor &LHS, const Factor &RHS) const {
+        return LHS.Power == RHS.Power;
+      }
+    };
+  };
+
+  /// Utility class representing a non-constant Xor-operand. We classify
+  /// non-constant Xor-Operands into two categories:
+  ///  C1) The operand is in the form "X & C", where C is a constant and C != ~0
+  ///  C2)
+  ///    C2.1) The operand is in the form of "X | C", where C is a non-zero
+  ///          constant.
+  ///    C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
+  ///          operand as "E | 0"
+  class XorOpnd {
+  public:
+    /// Decompose \p V into a symbolic part and a constant part; \p V must not
+    /// be a ConstantInt.
+    XorOpnd(Value *V);
+
+    /// True once Invalidate() has been called (the symbolic part is null).
+    bool isInvalid() const { return SymbolicPart == nullptr; }
+    /// True if the operand is viewed as "X | C", false if it is "X & C".
+    bool isOrExpr() const { return isOr; }
+    Value *getValue() const { return OrigVal; }
+    Value *getSymbolicPart() const { return SymbolicPart; }
+    unsigned getSymbolicRank() const { return SymbolicRank; }
+    const APInt &getConstPart() const { return ConstPart; }
+
+    /// Mark this operand as invalid by clearing both the original value and
+    /// the symbolic part.
+    void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+    void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+    // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank.
+    // The purpose is twofold:
+    // 1) Cluster together the operands sharing the same symbolic-value.
+    // 2) Operand having smaller symbolic-value-rank is permuted earlier, which
+    //   could potentially shorten critical path, and expose more
+    //   loop-invariants.
+    //   Note that values' rank are basically defined in RPO order (FIXME).
+    //   So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier
+    //   than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
+    //   "z" in the order of X-Y-Z is better than any other orders.
+    struct PtrSortFunctor {
+      // const-qualified so the functor can be invoked through a const object.
+      bool operator()(XorOpnd * const &LHS, XorOpnd * const &RHS) const {
+        return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+      }
+    };
+  private:
+    Value *OrigVal;        // The operand as originally given.
+    Value *SymbolicPart;   // The non-constant part X (or the operand itself).
+    APInt ConstPart;       // The constant C (zero for the "E | 0" view).
+    unsigned SymbolicRank; // Set by setSymbolicRank(); starts at 0.
+    bool isOr;             // Whether the operand is viewed as an 'or'.
+  };
+}
+
+namespace {
+ /// Function pass that reassociates expressions (see the file header for the
+ /// overall goal). Operands are ordered using the ranks computed by
+ /// BuildRankMap() and getRank().
+ class Reassociate : public FunctionPass {
+ // Base rank assigned to each basic block by BuildRankMap().
+ DenseMap<BasicBlock*, unsigned> RankMap;
+ // Rank of individual values: seeded by BuildRankMap() and filled in lazily
+ // by getRank().
+ DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
+ // NOTE(review): presumably the instructions queued for (re)processing;
+ // its uses are outside this part of the file.
+ SetVector<AssertingVH<Instruction> > RedoInsts;
+ // NOTE(review): presumably records whether the pass changed the function;
+ // it is not initialized by the constructor, so confirm runOnFunction sets it.
+ bool MadeChange;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ Reassociate() : FunctionPass(ID) {
+ initializeReassociatePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ // Declares that this pass preserves the CFG and GlobalsAAWrapperPass.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ private:
+ // Assigns ranks to arguments, basic blocks and unmovable instructions.
+ void BuildRankMap(Function &F);
+ // Returns (and caches) the rank of V.
+ unsigned getRank(Value *V);
+ // Moves constants to the RHS, otherwise orders the two operands by rank.
+ void canonicalizeOperands(Instruction *I);
+ void ReassociateExpression(BinaryOperator *I);
+ void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
+ Value *OptimizeExpression(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops);
+ Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+ Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+ bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd,
+ Value *&Res);
+ bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+ APInt &ConstOpnd, Value *&Res);
+ bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors);
+ Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors);
+ Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
+ Value *RemoveFactorFromExpression(Value *V, Value *Factor);
+ void EraseInst(Instruction *I);
+ void RecursivelyEraseDeadInsts(Instruction *I,
+ SetVector<AssertingVH<Instruction>> &Insts);
+ void OptimizeInst(Instruction *I);
+ Instruction *canonicalizeNegConstExpr(Instruction *I);
+ };
+}
+
+/// Classify \p V as either "X & C" / "X | C" (when it is an And/Or
+/// instruction with a ConstantInt operand) or, failing that, as "V | 0".
+/// The symbolic rank always starts out as 0.
+XorOpnd::XorOpnd(Value *V) {
+  assert(!isa<ConstantInt>(V) && "No ConstantInt");
+  OrigVal = V;
+  SymbolicRank = 0;
+
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    const unsigned Opc = I->getOpcode();
+    if (Opc == Instruction::Or || Opc == Instruction::And) {
+      Value *Sym = I->getOperand(0);
+      Value *MaybeConst = I->getOperand(1);
+      // Make sure a constant operand, if any, ends up in 'MaybeConst'.
+      if (isa<ConstantInt>(Sym))
+        std::swap(Sym, MaybeConst);
+
+      if (auto *C = dyn_cast<ConstantInt>(MaybeConst)) {
+        ConstPart = C->getValue();
+        SymbolicPart = Sym;
+        isOr = (Opc == Instruction::Or);
+        return;
+      }
+    }
+  }
+
+  // view the operand as "V | 0"
+  SymbolicPart = V;
+  ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+  isOr = true;
+}
+
+// Definition of the pass's static ID member, followed by registration of the
+// pass under the name "reassociate".
+char Reassociate::ID = 0;
+INITIALIZE_PASS(Reassociate, "reassociate",
+ "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass: returns a newly allocated
+// Reassociate instance.
+FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
+
+/// If \p V is an instruction with opcode \p Opcode that has exactly one use
+/// (and, for floating point operations, has unsafe-algebra set), return it
+/// as a BinaryOperator. Otherwise return nullptr.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I || !V->hasOneUse() || I->getOpcode() != Opcode)
+    return nullptr;
+
+  // Floating point operations may only be reassociated under unsafe algebra.
+  if (isa<FPMathOperator>(I) && !I->hasUnsafeAlgebra())
+    return nullptr;
+
+  return cast<BinaryOperator>(I);
+}
+
+/// If \p V is an instruction whose opcode is \p Opcode1 or \p Opcode2, that
+/// has exactly one use (and, for floating point operations, has
+/// unsafe-algebra set), return it as a BinaryOperator. Otherwise return
+/// nullptr.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+                                        unsigned Opcode2) {
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I || !V->hasOneUse())
+    return nullptr;
+
+  const unsigned Opc = I->getOpcode();
+  if (Opc != Opcode1 && Opc != Opcode2)
+    return nullptr;
+
+  // Floating point operations may only be reassociated under unsafe algebra.
+  if (isa<FPMathOperator>(I) && !I->hasUnsafeAlgebra())
+    return nullptr;
+
+  return cast<BinaryOperator>(I);
+}
+
+/// Seed the rank tables for \p F: every argument receives its own rank, every
+/// basic block (visited in reverse post order) receives a base rank shifted
+/// into the upper bits, and every instruction for which
+/// mayBeMemoryDependent() holds receives a distinct rank counted up from its
+/// block's base rank.
+void Reassociate::BuildRankMap(Function &F) {
+  unsigned NextRank = 2;
+
+  // Assign distinct ranks to function arguments.
+  for (Argument &Arg : F.args()) {
+    ValueRankMap[&Arg] = ++NextRank;
+    DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << NextRank << "\n");
+  }
+
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (BasicBlock *BB : RPOT) {
+    unsigned BBRank = RankMap[BB] = ++NextRank << 16;
+
+    // Walk the basic block, adding precomputed ranks for any instructions that
+    // we cannot move.  This ensures that the ranks for these instructions are
+    // all different in the block.
+    for (Instruction &Inst : *BB)
+      if (mayBeMemoryDependent(Inst))
+        ValueRankMap[&Inst] = ++BBRank;
+  }
+}
+
+/// Return the rank of \p V. Arguments use the rank recorded in ValueRankMap,
+/// other non-instructions have rank 0, and instructions get a rank derived
+/// from their operands which is cached in ValueRankMap.
+unsigned Reassociate::getRank(Value *V) {
+  auto *Inst = dyn_cast<Instruction>(V);
+  if (!Inst) {
+    // Function argument: use its recorded rank.
+    if (isa<Argument>(V))
+      return ValueRankMap[V];
+    // Otherwise it's a global or constant, rank 0.
+    return 0;
+  }
+
+  // Rank already known?
+  if (unsigned Cached = ValueRankMap[Inst])
+    return Cached;
+
+  // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
+  // we can reassociate expressions for code motion!  Since we do not recurse
+  // for PHI nodes, we cannot have infinite recursion here, because there
+  // cannot be loops in the value graph that do not go through PHI nodes.
+  const unsigned MaxRank = RankMap[Inst->getParent()];
+  unsigned Rank = 0;
+  for (unsigned Idx = 0, NumOps = Inst->getNumOperands(); Idx != NumOps;
+       ++Idx) {
+    // Stop scanning as soon as the rank reaches the block's own rank.
+    if (Rank == MaxRank)
+      break;
+    Rank = std::max(Rank, getRank(Inst->getOperand(Idx)));
+  }
+
+  // If this is a not or neg instruction, do not count it for rank.  This
+  // assures us that X and ~X will have the same rank.
+  const bool IsNegation = BinaryOperator::isNot(Inst) ||
+                          BinaryOperator::isNeg(Inst) ||
+                          BinaryOperator::isFNeg(Inst);
+  if (!IsNegation)
+    ++Rank;
+
+  DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+
+  return ValueRankMap[Inst] = Rank;
+}
+
+// Put the operands of the commutative binary operator 'I' into canonical
+// order: a constant goes on the RHS; otherwise the operands are swapped when
+// the RHS has a lower rank than the LHS.
+void Reassociate::canonicalizeOperands(Instruction *I) {
+  assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+  assert(I->isCommutative() && "Expected commutative operator.");
+
+  Value *Op0 = I->getOperand(0);
+  Value *Op1 = I->getOperand(1);
+  // Ranks are computed up front, for both operands, before any early exit.
+  const unsigned Rank0 = getRank(Op0);
+  const unsigned Rank1 = getRank(Op1);
+
+  // A constant on the RHS is already canonical.
+  if (isa<Constant>(Op1))
+    return;
+
+  const bool ShouldSwap = isa<Constant>(Op0) || Rank1 < Rank0;
+  if (ShouldSwap)
+    cast<BinaryOperator>(I)->swapOperands();
+}
+
+/// Create "S1 + S2" before \p InsertBefore. Integer (or integer vector)
+/// operands produce an Add; anything else produces an FAdd that copies the
+/// fast-math flags of \p FlagsOp, which must then be an FPMathOperator.
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntOrIntVectorTy())
+    return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+
+  BinaryOperator *FAdd = BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+  FAdd->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+  return FAdd;
+}
+
+/// Create "S1 * S2" before \p InsertBefore. Integer (or integer vector)
+/// operands produce a Mul; anything else produces an FMul that copies the
+/// fast-math flags of \p FlagsOp, which must then be an FPMathOperator.
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntOrIntVectorTy())
+    return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+
+  BinaryOperator *FMul = BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+  FMul->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+  return FMul;
+}
+
+/// Create the negation of \p S1 before \p InsertBefore. An integer (or
+/// integer vector) operand produces a Neg; anything else produces an FNeg
+/// that copies the fast-math flags of \p FlagsOp, which must then be an
+/// FPMathOperator.
+static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntOrIntVectorTy())
+    return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+
+  BinaryOperator *FNeg = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
+  FNeg->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+  return FNeg;
+}
+
+/// Replace 0-X with X*-1.
+///
+/// A multiply of \p Neg's second operand by -1 is inserted before \p Neg, and
+/// it takes over \p Neg's name, uses and debug location. \p Neg itself is
+/// left in place with its second operand replaced by a null value.
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+  Type *Ty = Neg->getType();
+  Constant *NegOne;
+  if (Ty->isIntOrIntVectorTy())
+    NegOne = ConstantInt::getAllOnesValue(Ty);
+  else
+    NegOne = ConstantFP::get(Ty, -1.0);
+
+  BinaryOperator *Mul = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
+  Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
+  Mul->takeName(Neg);
+  Neg->replaceAllUsesWith(Mul);
+  Mul->setDebugLoc(Neg->getDebugLoc());
+  return Mul;
+}
+
+/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
+/// function.  This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
+///
+/// Concretely: Bitwidth - 1 for widths below 3, Bitwidth - 2 otherwise.
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+  return Bitwidth - (Bitwidth < 3 ? 1 : 2);
+}
+
+/// Fold the additional weight 'RHS' into the accumulated weight 'LHS', using
+/// the algebraic properties of the operation 'Opcode' to keep the result small.
+/// A weight of W stands for "X op X op ... op X" with X occurring W times; on
+/// return LHS describes the expression in which X occurs LHS + RHS times. For
+/// example with "Xor" the combined expression is X when LHS + RHS is odd and 0
+/// when it is even, so LHS becomes 1 or 0 respectively.
+///
+/// With infinite precision the answer would simply be LHS + RHS. The APInt sum
+/// may wrap, though. Wrapping is harmless for nilpotent operations and for
+/// addition, but not for idempotent operations and multiplication, so for
+/// those the weight is explicitly reduced back into range.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+  // Adding nothing changes nothing.
+  if (RHS.isMinValue())
+    return;
+  // Nothing accumulated so far: the total is just the increment.
+  if (LHS.isMinValue()) {
+    LHS = RHS;
+    return;
+  }
+  // Both weights are known to be non-zero below.
+
+  // Idempotent (X op X === X): every non-zero weight collapses to 1, which
+  // also rules out wrapping. The weight stays at 1.
+  if (Instruction::isIdempotent(Opcode)) {
+    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+    return;
+  }
+
+  // Nilpotent (X op X === 0): weights live modulo 2, and 1 + 1 === 0.
+  if (Instruction::isNilpotent(Opcode)) {
+    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+    LHS = 0;
+    return;
+  }
+
+  // Addition: the plain (possibly wrapping) sum is correct.
+  // TODO: Reduce the weight by exploiting nsw/nuw?
+  const bool IsAddition =
+      Opcode == Instruction::Add || Opcode == Instruction::FAdd;
+  if (IsAddition) {
+    LHS += RHS;
+    return;
+  }
+
+  assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+         "Unknown associative operation!");
+
+  // Multiplication. Let Lambda be the value of Carmichael's lambda function
+  // for 2^Bitwidth. A weight W >= Lambda + Bitwidth may be replaced by
+  // W - Lambda: for odd x we have x^Lambda = 1, and for even x both x^W and
+  // x^(W - Lambda) are zero. Repeating this brings every weight into the range
+  // [0, Lambda + Bitwidth), which conveniently always fits in Bitwidth bits.
+  // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
+  // the Carmichael number).
+  const unsigned Bitwidth = LHS.getBitWidth();
+
+  if (Bitwidth <= 3) {
+    // For tiny widths the APInt sum could overflow, so perform the very same
+    // reduction in a wider native type.
+    const unsigned Lambda = 1U << CarmichaelShift(Bitwidth);
+    const unsigned Limit = Lambda + Bitwidth;
+    assert(LHS.getZExtValue() < Limit && RHS.getZExtValue() < Limit &&
+           "Weights not reduced!");
+    unsigned Sum = LHS.getZExtValue() + RHS.getZExtValue();
+    while (Sum >= Limit)
+      Sum -= Lambda;
+    LHS = Sum;
+    return;
+  }
+
+  // For a Bitwidth of 4 or more the sum of two reduced weights cannot overflow.
+  const APInt Lambda = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+  // Any weight W >= Limit can be replaced with W - Lambda.
+  const APInt Limit = Lambda + Bitwidth;
+  assert(LHS.ult(Limit) && RHS.ult(Limit) && "Weights not reduced!");
+  LHS += RHS;
+  while (LHS.uge(Limit))
+    LHS -= Lambda;
+}
+
+/// A value paired with its weight, i.e. how many times it occurs.
+using RepeatedValue = std::pair<Value*, APInt>;
+
+/// Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs). The
+/// original expression is the same as
+/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
+/// op
+/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
+/// op
+/// ...
+/// op
+/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'. The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different. Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node. Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+///           expression graph        node names
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B
+///                  / \ / \     |
+///                 *   +   *    |    C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |      F,  G
+///
+/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression. This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef. If it is B's operand then this gives:
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B - operand of B replaced with undef
+///                  / \   \     |
+///                 *   +   *    |    C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |      F,  G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
+
+static bool LinearizeExprTree(BinaryOperator *I,
+ SmallVectorImpl<RepeatedValue> &Ops) {
+ DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); // Width used for all weights.
+ unsigned Opcode = I->getOpcode(); // Opcode of the root; nodes of the expression must share it.
+ assert(I->isAssociative() && I->isCommutative() &&
+ "Expected an associative and commutative operation!");
+
+ // Visit all operands of the expression, keeping track of their weight (the
+ // number of paths from the expression root to the operand, or if you like
+ // the number of times that operand occurs in the linearized expression).
+ // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+ // while A has weight two.
+
+ // Worklist of non-leaf nodes (their operands are in the expression too) along
+ // with their weights, representing a certain number of paths to the operator.
+ // If an operator occurs in the worklist multiple times then we found multiple
+ // ways to get to it.
+ SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); // The root starts with weight 1.
+ bool Changed = false; // Set once the IR has been modified; this is the return value.
+
+ // Leaves of the expression are values that either aren't the right kind of
+ // operation (eg: a constant, or a multiply in an add tree), or are, but have
+ // some uses that are not inside the expression. For example, in I = X + X,
+ // X = A + B, the value X has two uses (by I) that are in the expression. If
+ // X has any other uses, for example in a return instruction, then we consider
+ // X to be a leaf, and won't analyze it further. When we first visit a value,
+ // if it has more than one use then at first we conservatively consider it to
+ // be a leaf. Later, as the expression is explored, we may discover some more
+ // uses of the value from inside the expression. If all uses turn out to be
+ // from within the expression (and the value is a binary operator of the right
+ // kind) then the value is no longer considered to be a leaf, and its operands
+ // are explored.
+
+ // Leaves - Keeps track of the set of putative leaves as well as the number of
+ // paths to each leaf seen so far.
+ typedef DenseMap<Value*, APInt> LeafMap;
+ LeafMap Leaves; // Leaf -> Total weight so far.
+ SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+ SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+ while (!Worklist.empty()) {
+ std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
+ I = P.first; // Overwrites parameter I: from here on I is the node whose operands we examine.
+
+ for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
+ Value *Op = I->getOperand(OpIdx);
+ APInt Weight = P.second; // Number of paths to this operand.
+ DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+ // If isReassociableOp accepts the operand as a binary operation of the right
+ // kind (presumably also requiring a single use), add its operands too.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ assert(Visited.insert(Op).second && "Not first visit!");
+ DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ Worklist.push_back(std::make_pair(BO, Weight));
+ continue;
+ }
+
+ // Appears to be a leaf. Is the operand already in the set of leaves?
+ LeafMap::iterator It = Leaves.find(Op);
+ if (It == Leaves.end()) {
+ // Not in the leaf map. Must be the first time we saw this operand.
+ assert(Visited.insert(Op).second && "Not first visit!");
+ if (!Op->hasOneUse()) {
+ // This value has uses not accounted for by the expression, so it is
+ // not safe to modify. Mark it as being a leaf.
+ DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ continue;
+ }
+ // No uses outside the expression, try morphing it.
+ } else if (It != Leaves.end()) { // Always true here: exact complement of the 'if' above.
+ // Already in the leaf map.
+ assert(Visited.count(Op) && "In leaf map but not visited!");
+
+ // Update the number of paths to the leaf.
+ IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0 // TODO: Re-enable once PR13021 is fixed.
+ // The leaf already has one use from inside the expression. As we want
+ // exactly one such use, drop this new use of the leaf.
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+ I->setOperand(OpIdx, UndefValue::get(I->getType()));
+ Changed = true;
+
+ // If the leaf is a binary operation of the right kind and we now see
+ // that its multiple original uses were in fact all by nodes belonging
+ // to the expression, then no longer consider it to be a leaf and add
+ // its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ Worklist.push_back(std::make_pair(BO, It->second));
+ Leaves.erase(It);
+ continue;
+ }
+#endif
+
+ // If we still have uses that are not accounted for by the expression
+ // then it is not safe to modify the value.
+ if (!Op->hasOneUse())
+ continue;
+
+ // No uses outside the expression, try morphing it.
+ Weight = It->second; // Carry the accumulated weight, not just this path's.
+ Leaves.erase(It); // Since the value may be morphed below.
+ }
+
+ // At this point we have a value which, first of all, is not a binary
+ // expression of the right kind, and secondly, is only used inside the
+ // expression. This means that it can safely be modified. See if we
+ // can usefully morph it into an expression of the right kind.
+ assert((!isa<Instruction>(Op) ||
+ cast<Instruction>(Op)->getOpcode() != Opcode
+ || (isa<FPMathOperator>(Op) &&
+ !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
+ "Should have been handled above!");
+ assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+ // If this is a multiply expression, turn any internal negations into
+ // multiplies by -1 so they can be reassociated.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
+ if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+ (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+ DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ BO = LowerNegateToMultiply(BO);
+ DEBUG(dbgs() << *BO << '\n');
+ Worklist.push_back(std::make_pair(BO, Weight)); // The new multiply is explored as a non-leaf.
+ Changed = true;
+ continue;
+ }
+
+ // Failed to morph into an expression of the right type. This really is
+ // a leaf.
+ DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ }
+ }
+
+ // The leaves, repeated according to their weights, represent the linearized
+ // form of the expression. LeafOrder may list a value more than once.
+ for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+ Value *V = LeafOrder[i];
+ LeafMap::iterator It = Leaves.find(V);
+ if (It == Leaves.end())
+ // Node initially thought to be a leaf wasn't.
+ continue;
+ assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+ APInt Weight = It->second;
+ if (Weight.isMinValue())
+ // Leaf already output or weight reduction eliminated it.
+ continue;
+ // Ensure the leaf is only output once.
+ It->second = 0;
+ Ops.push_back(std::make_pair(V, Weight));
+ }
+
+ // For nilpotent operations or addition there may be no operands, for example
+ // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+ // in both cases the weight reduces to 0 causing the value to be skipped.
+ if (Ops.empty()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); // I is the last node popped above, not necessarily the root.
+ assert(Identity && "Associative operation without identity!");
+ Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+ }
+
+ return Changed;
+}
+
+/// Write the linearized and optimized operand list Ops back into the expression
+/// rooted at I, emitting the operands in order and reusing existing operators.
+void Reassociate::RewriteExprTree(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ assert(Ops.size() > 1 && "Single values should be used directly!");
+
+ // Since our optimizations should never increase the number of operations, the
+ // new expression can usually be written reusing the existing binary operators
+ // from the original expression tree, without creating any new instructions,
+ // though the rewritten expression may have a completely different topology.
+ // We take care to not change anything if the new expression will be the same
+ // as the original. If more than trivial changes (like commuting operands)
+ // were made then we are obliged to clear out any optional subclass data like
+ // nsw flags.
+
+ /// NodesToRewrite - Nodes from the original expression available for writing
+ /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite;
+ unsigned Opcode = I->getOpcode();
+ BinaryOperator *Op = I; // The node currently being rewritten; starts at the root.
+
+ /// NotRewritable - The operands being written will be the leaves of the new
+ /// expression and must not be used as inner nodes (via NodesToRewrite) by
+ /// mistake. Inner nodes are always reassociable, and usually leaves are not
+ /// (if they were they would have been incorporated into the expression and so
+ /// would not be leaves), so most of the time there is no danger of this. But
+ /// in rare cases a leaf may become reassociable if an optimization kills uses
+ /// of it, or it may momentarily become reassociable during rewriting (below)
+ /// due to it being removed as an operand of one of its uses. Ensure that misuse
+ /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+ /// leaves and refusing to reuse any of them as inner nodes.
+ SmallPtrSet<Value*, 8> NotRewritable;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ NotRewritable.insert(Ops[i].Op);
+
+ // ExpressionChanged - Non-null if the rewritten expression differs from the
+ // original in some non-trivial way, requiring the clearing of optional flags.
+ // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+ BinaryOperator *ExpressionChanged = nullptr;
+ for (unsigned i = 0; ; ++i) { // No loop condition: exits via 'break' at the last operation.
+ // The last operation (which comes earliest in the IR) is special as both
+ // operands will come from Ops, rather than just one with the other being
+ // a subexpression.
+ if (i+2 == Ops.size()) {
+ Value *NewLHS = Ops[i].Op;
+ Value *NewRHS = Ops[i+1].Op;
+ Value *OldLHS = Op->getOperand(0);
+ Value *OldRHS = Op->getOperand(1);
+
+ if (NewLHS == OldLHS && NewRHS == OldRHS)
+ // Nothing changed, leave it alone.
+ break;
+
+ if (NewLHS == OldRHS && NewRHS == OldLHS) {
+ // The order of the operands was reversed. Swap them.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->swapOperands();
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ break;
+ }
+
+ // The new operation differs non-trivially from the original. Overwrite
+ // the old operands with the new ones, keeping displaced nodes for reuse.
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewLHS != OldLHS) {
+ BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(0, NewLHS);
+ }
+ if (NewRHS != OldRHS) {
+ BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+
+ break;
+ }
+
+ // Not the last operation. The left-hand side will be a sub-expression
+ // while the right-hand side will be the current element of Ops.
+ Value *NewRHS = Ops[i].Op;
+ if (NewRHS != Op->getOperand(1)) {
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewRHS == Op->getOperand(0)) {
+ // The new right-hand side was already present as the left operand. If
+ // we are lucky then swapping the operands will sort out both of them.
+ Op->swapOperands();
+ } else {
+ // Overwrite with the new right-hand side.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ ExpressionChanged = Op;
+ }
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ }
+
+ // Now deal with the left-hand side. If this is already an operation node
+ // from the original expression then just rewrite the rest of the expression
+ // into it.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+ if (BO && !NotRewritable.count(BO)) {
+ Op = BO;
+ continue;
+ }
+
+ // Otherwise, grab a spare node from the original expression and use that as
+ // the left-hand side. If there are no nodes left then the optimizers made
+ // an expression with more nodes than the original! This usually means that
+ // they did something stupid but it might mean that the problem was just too
+ // hard (finding the minimal number of multiplications needed to realize a
+ // multiplication expression is NP-complete). Whatever the reason, smart or
+ // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp;
+ if (NodesToRewrite.empty()) {
+ Constant *Undef = UndefValue::get(I->getType()); // Placeholder operands; overwritten on later iterations.
+ NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+ Undef, Undef, "", I);
+ if (NewOp->getType()->isFPOrFPVectorTy())
+ NewOp->setFastMathFlags(I->getFastMathFlags());
+ } else {
+ NewOp = NodesToRewrite.pop_back_val();
+ }
+
+ DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->setOperand(0, NewOp);
+ DEBUG(dbgs() << "TO: " << *Op << '\n');
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+ Op = NewOp; // Continue rewriting into the node just installed.
+ }
+
+ // If the expression changed non-trivially then clear out all subclass data
+ // starting from the operator specified in ExpressionChanged, and compactify
+ // the operators to just before the expression root to guarantee that the
+ // expression tree is dominated by all of Ops.
+ if (ExpressionChanged)
+ do {
+ // Preserve FastMathFlags: re-apply the root's flags after clearing.
+ if (isa<FPMathOperator>(I)) {
+ FastMathFlags Flags = I->getFastMathFlags();
+ ExpressionChanged->clearSubclassOptionalData();
+ ExpressionChanged->setFastMathFlags(Flags);
+ } else
+ ExpressionChanged->clearSubclassOptionalData();
+
+ if (ExpressionChanged == I)
+ break;
+ ExpressionChanged->moveBefore(I);
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin()); // Step up to the node's first user.
+ } while (1);
+
+ // Throw away any left over nodes from the original expression by queueing
+ // them in RedoInsts (presumably they are cleaned up when revisited).
+ for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+ RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// Return a value that computes the negation of V, inserting any instructions
+/// that are needed before BI. Constants are negated via ConstantExpr, a
+/// reassociable add has the negation pushed into both of its operands (and is
+/// moved before BI), an existing negate found among the users of V is reused,
+/// and otherwise a new negate is created. BI is passed by value, so the
+/// caller's insertion point is not changed. Every instruction that is modified,
+/// moved or created here is added to ToRedo so that it gets revisited later.
+static Value *NegateValue(Value *V, Instruction *BI,
+ SetVector<AssertingVH<Instruction>> &ToRedo) {
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (C->getType()->isFPOrFPVectorTy()) {
+ return ConstantExpr::getFNeg(C);
+ }
+ return ConstantExpr::getNeg(C);
+ }
+
+
+ // We are trying to expose opportunity for reassociation. One of the things
+ // that we want to do to achieve this is to push a negation as deep into an
+ // expression chain as possible, to expose the add instructions. In practice,
+ // this means that we turn this:
+ // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+ // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
+ // the constants. We assume that instcombine will clean up the mess later if
+ // we introduce tons of unnecessary negation instructions.
+ //
+ if (BinaryOperator *I =
+ isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
+ // Push the negates through the add: -(A + B) becomes (-A) + (-B).
+ I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
+ if (I->getOpcode() == Instruction::Add) {
+ I->setHasNoUnsignedWrap(false); // The add is reused with new operands, so drop its wrap flags.
+ I->setHasNoSignedWrap(false);
+ }
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+
+ // Add the intermediate negates to the redo list as processing them later
+ // could expose more reassociating opportunities.
+ ToRedo.insert(I);
+ return I;
+ }
+
+ // Okay, we need to materialize a negated version of V with an instruction.
+ // Scan the use lists of V to see if we have one already.
+ for (User *U : V->users()) {
+ if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+ continue;
+
+ // We found one! Now we have to make sure that the definition dominates
+ // this use. We do this by moving it to the entry block (if it is a
+ // non-instruction value) or right after the definition. These negates will
+ // be zapped by reassociate later, so we don't need much finesse here.
+ BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+
+ // Verify that the negate is in this function, V might be a constant expr.
+ if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+ continue;
+
+ BasicBlock::iterator InsertPt;
+ if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+ InsertPt = II->getNormalDest()->begin(); // An invoke ends its block, so use the start of its normal destination.
+ } else {
+ InsertPt = ++InstInput->getIterator(); // Directly after the definition of V.
+ }
+ while (isa<PHINode>(InsertPt)) ++InsertPt; // Skip past any PHI nodes at the insertion point.
+ } else {
+ InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+ }
+ TheNeg->moveBefore(&*InsertPt);
+ if (TheNeg->getOpcode() == Instruction::Sub) {
+ TheNeg->setHasNoUnsignedWrap(false); // Integer negate: drop both wrap flags.
+ TheNeg->setHasNoSignedWrap(false);
+ } else {
+ TheNeg->andIRFlags(BI); // Otherwise combine the negate's IR flags with those of BI.
+ }
+ ToRedo.insert(TheNeg);
+ return TheNeg;
+ }
+
+ // Insert a 'neg' instruction that subtracts the value from zero to get the
+ // negation.
+ BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+ ToRedo.insert(NewNeg);
+ return NewNeg;
+}
+
+/// Return true if we should break up this subtract of X-Y into (X + -Y).
+///
+/// The subtract is worth breaking up when either of its operands is a
+/// reassociable add or subtract, or when its single user is one. Negations and
+/// subtractions of undef are never broken up.
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+  // If this is a negation, we can't split it up!
+  if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+    return false;
+
+  // Don't breakup X - undef.
+  if (isa<UndefValue>(Sub->getOperand(1)))
+    return false;
+
+  // Don't bother to break this up unless either the LHS is an associable add or
+  // subtract or if this is only used by one.
+  Value *V0 = Sub->getOperand(0);
+  if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+      isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
+    return true;
+  Value *V1 = Sub->getOperand(1);
+  if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+      isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
+    return true;
+
+  // Only look at the user once we know there is exactly one. Asking for
+  // user_back() on a value without any uses would walk an empty use list.
+  if (Sub->hasOneUse()) {
+    Value *VB = Sub->user_back();
+    if (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+        isReassociableOp(VB, Instruction::Sub, Instruction::FSub))
+      return true;
+  }
+
+  return false;
+}
+
+/// Turn the subtraction X - Y into the addition X + (-Y) so that it can be
+/// commuted and reassociated with neighbouring adds. The negation of Y is
+/// obtained from NegateValue (which records touched instructions in ToRedo),
+/// and all users of 'Sub' are redirected to the new add, which is returned.
+/// 'Sub' itself is not erased; it is only detached from its operands.
+static BinaryOperator *
+BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+  // First compute -Y in front of the subtract, then form X + (-Y), using the
+  // subtract as the insertion point.
+  Value *NegatedRHS = NegateValue(Sub->getOperand(1), Sub, ToRedo);
+  BinaryOperator *Sum =
+      CreateAdd(Sub->getOperand(0), NegatedRHS, "", Sub, Sub);
+
+  // The old subtract must no longer use X or Y.
+  Constant *Zero = Constant::getNullValue(Sub->getType());
+  Sub->setOperand(0, Zero);
+  Sub->setOperand(1, Zero);
+
+  // The add takes over the name, the users and the debug location.
+  Sum->takeName(Sub);
+  Sub->replaceAllUsesWith(Sum);
+  Sum->setDebugLoc(Sub->getDebugLoc());
+
+  DEBUG(dbgs() << "Negated: " << *Sum << '\n');
+  return Sum;
+}
+
+/// Replace the left shift 'Shl' (X << C, with C a constant) by the equivalent
+/// multiplication X * (1 << C), which can take part in reassociation. All users
+/// of the shift are redirected to the multiply, which is returned. The shift is
+/// left in place with its first operand replaced by undef.
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+  Type *ShlTy = Shl->getType();
+
+  // The factor is 1 shifted left by the (constant) shift amount.
+  Constant *ShiftAmt = cast<Constant>(Shl->getOperand(1));
+  Constant *Factor = ConstantExpr::getShl(ConstantInt::get(ShlTy, 1), ShiftAmt);
+
+  BinaryOperator *Mul =
+      BinaryOperator::CreateMul(Shl->getOperand(0), Factor, "", Shl);
+
+  // The shift must no longer use X.
+  Shl->setOperand(0, UndefValue::get(ShlTy));
+
+  // The multiply takes over the name, the users and the debug location.
+  Mul->takeName(Shl);
+  Shl->replaceAllUsesWith(Mul);
+  Mul->setDebugLoc(Shl->getDebugLoc());
+
+  // nuw can always be carried over. nsw is only carried over together with
+  // nuw; nsw on its own would need special handling and is dropped.
+  BinaryOperator *ShlBO = cast<BinaryOperator>(Shl);
+  const bool KeepNUW = ShlBO->hasNoUnsignedWrap();
+  const bool KeepNSW = KeepNUW && ShlBO->hasNoSignedWrap();
+  if (KeepNSW)
+    Mul->setHasNoSignedWrap(true);
+  Mul->setHasNoUnsignedWrap(KeepNUW);
+  return Mul;
+}
+
+/// Search the entries of Ops that share the rank of entry i for X, looking
+/// first at the entries after i and then at the entries before it. An entry
+/// matches if it is X itself or an instruction identical to X. Returns the
+/// index of the match, or i if there is none. This is useful when scanning for
+/// 'x' after seeing '-x', because both get the same rank.
+static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
+                                  Value *X) {
+  const unsigned Rank = Ops[i].Rank;
+
+  // Does the entry at Idx hold X, or an instruction identical to X?
+  auto Matches = [&](unsigned Idx) -> bool {
+    if (Ops[Idx].Op == X)
+      return true;
+    if (Instruction *Cand = dyn_cast<Instruction>(Ops[Idx].Op))
+      if (Instruction *Wanted = dyn_cast<Instruction>(X))
+        return Cand->isIdenticalTo(Wanted);
+    return false;
+  };
+
+  // Forward scan over the following entries of equal rank.
+  for (unsigned Next = i + 1, End = Ops.size();
+       Next != End && Ops[Next].Rank == Rank; ++Next)
+    if (Matches(Next))
+      return Next;
+
+  // Backward scan over the preceding entries of equal rank. 'Past' is one more
+  // than the index being examined, so it never has to drop below zero.
+  for (unsigned Past = i; Past != 0 && Ops[Past - 1].Rank == Rank; --Past)
+    if (Matches(Past - 1))
+      return Past - 1;
+
+  return i;
+}
+
+/// Sum all values in Ops with a left-leaning chain of adds inserted before I,
+/// i.e. ((Ops[0] + Ops[1]) + Ops[2]) + ..., and return the final result (or
+/// the sole value if Ops has a single element). Ops must not be empty. On
+/// return only the first element of Ops is left in the vector.
+static Value *EmitAddTreeOfValues(Instruction *I,
+                                  SmallVectorImpl<WeakVH> &Ops){
+  Value *Sum = Ops[0];
+
+  // Fold in the remaining values front to back, so that the adds are created
+  // innermost first.
+  for (unsigned Idx = 1, End = Ops.size(); Idx != End; ++Idx) {
+    Value *Next = Ops[Idx];
+    Sum = CreateAdd(Sum, Next, "tmp", I, I);
+  }
+
+  // Everything but the first value has been consumed.
+  while (Ops.size() > 1)
+    Ops.pop_back();
+  return Sum;
+}
+
+/// If V is a reassociable multiply (Mul or FMul) whose linearized factor list
+/// contains Factor, or a constant equal to -Factor, remove that one factor and
+/// return the remaining product (negated in the -Factor case), else nullptr.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO)
+ return nullptr;
+
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(BO, Tree);
+ SmallVector<ValueEntry, 8> Factors;
+ Factors.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Factors.append(E.second.getZExtValue(), // Expand each (value, weight) pair into 'weight' copies.
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ bool FoundFactor = false;
+ bool NeedsNegate = false; // Set when the removed factor was -Factor instead of Factor.
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ if (Factors[i].Op == Factor) {
+ FoundFactor = true;
+ Factors.erase(Factors.begin()+i); // Remove a single occurrence only; the break keeps i and e valid.
+ break;
+ }
+
+ // If this is a negative version of this factor, remove it.
+ if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
+ if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+ if (FC1->getValue() == -FC2->getValue()) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+ } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+ if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+ APFloat F1(FC1->getValueAPF());
+ APFloat F2(FC2->getValueAPF());
+ F2.changeSign(); // Compare Factor against the negated candidate.
+ if (F1.compare(F2) == APFloat::cmpEqual) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin() + i);
+ break;
+ }
+ }
+ }
+ }
+
+ if (!FoundFactor) {
+ // Make sure to restore the operands to the expression tree.
+ RewriteExprTree(BO, Factors);
+ return nullptr;
+ }
+
+ BasicBlock::iterator InsertPt = ++BO->getIterator(); // Position right after BO, captured before BO is rewritten.
+
+ // If this was just a single multiply, remove the multiply and return the only
+ // remaining operand. BO is queued in RedoInsts rather than erased here.
+ if (Factors.size() == 1) {
+ RedoInsts.insert(BO);
+ V = Factors[0].Op;
+ } else {
+ RewriteExprTree(BO, Factors);
+ V = BO;
+ }
+
+ if (NeedsNegate)
+ V = CreateNeg(V, "neg", &*InsertPt, BO); // Compensate for having removed -Factor.
+
+ return V;
+}
+
+/// Collect the factors of V into Factors. If V is a reassociable multiply (Mul
+/// or FMul, as decided by isReassociableOp) its operands are collected
+/// recursively, right-hand operand first; any other value is appended to
+/// Factors as a factor in its own right.
+///
+/// Ops is the top-level list of add operands we're trying to factor. It is
+/// only passed along to the recursive calls.
+static void FindSingleUseMultiplyFactors(Value *V,
+                                         SmallVectorImpl<Value*> &Factors,
+                                       const SmallVectorImpl<ValueEntry> &Ops) {
+  if (BinaryOperator *Mul =
+          isReassociableOp(V, Instruction::Mul, Instruction::FMul)) {
+    // A multiply contributes the factors of its RHS followed by those of its
+    // LHS.
+    FindSingleUseMultiplyFactors(Mul->getOperand(1), Factors, Ops);
+    FindSingleUseMultiplyFactors(Mul->getOperand(0), Factors, Ops);
+    return;
+  }
+
+  // Anything else is a factor by itself.
+  Factors.push_back(V);
+}
+
+/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
+/// This optimizes based on identities. If it can be reduced to a single Value,
+/// it is returned, otherwise Ops is mutated as necessary and nullptr returned.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+ // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) { // e is kept in sync with Ops.size() by the erases below.
+ // First, check for X and ~X in the operand list.
+ assert(i < Ops.size());
+ if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
+ Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+ unsigned FoundX = FindInOperandList(Ops, i, X); // Returns i itself when X is not found.
+ if (FoundX != i) {
+ if (Opcode == Instruction::And) // ...&X&~X = 0
+ return Constant::getNullValue(X->getType());
+
+ if (Opcode == Instruction::Or) // ...|X|~X = -1
+ return Constant::getAllOnesValue(X->getType());
+ }
+ }
+
+ // Next, check for duplicate pairs of values, which we assume are next to
+ // each other, due to our sorting criteria.
+ assert(i < Ops.size());
+ if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+ if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+ // Drop duplicate values for And and Or.
+ Ops.erase(Ops.begin()+i);
+ --i; --e; // Revisit index i next; at i == 0 the unsigned wrap is undone by the loop's ++i.
+ ++NumAnnihil;
+ continue;
+ }
+
+ // Drop pairs of values for Xor.
+ assert(Opcode == Instruction::Xor);
+ if (e == 2)
+ return Constant::getNullValue(Ops[0].Op->getType());
+
+ // Y ^ X^X -> Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ i -= 1; e -= 2; // Two entries removed; revisit index i next (same wrap-around trick as above).
+ ++NumAnnihil;
+ }
+ }
+ return nullptr;
+}
+
+/// Helper for CombineXorOpnd(): produce the value of "Opnd & ConstOpnd",
+/// emitting code only when it is needed.
+///
+///   - A mask of 0 yields nullptr.
+///   - A mask of all ones yields Opnd itself.
+///   - Any other mask creates a new 'and' named "and.ra" in front of
+///     InsertBefore, carrying InsertBefore's debug location.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+                             const APInt &ConstOpnd) {
+  if (ConstOpnd == 0)
+    return nullptr;
+  if (ConstOpnd.isAllOnesValue())
+    return Opnd;
+
+  ConstantInt *Mask =
+      ConstantInt::get(Opnd->getType()->getContext(), ConstOpnd);
+  Instruction *AndI =
+      BinaryOperator::CreateAnd(Opnd, Mask, "and.ra", InsertBefore);
+  AndI->setDebugLoc(InsertBefore->getDebugLoc());
+  return AndI;
+}
+
+// Helper for OptimizeXor(): try to rewrite "Opnd1 ^ ConstOpnd" as "R ^ C",
+// where R is a symbolic value and C ends up being 0.
+//
+// On success, true is returned with R stored in "Res" and C stored in
+// "ConstOpnd". On failure, false is returned and neither "Res" nor
+// "ConstOpnd" is modified.
+//
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+                                 APInt &ConstOpnd, Value *&Res) {
+  // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
+  //                           = ((x | c1) ^ c1) ^ (c1 ^ c2)
+  //                           = (x & ~c1) ^ (c1 ^ c2)
+  // It is useful only when c1 == c2, so that is the only case handled.
+  const bool IsOrWithNonZeroConst =
+      Opnd1->isOrExpr() && Opnd1->getConstPart() != 0;
+  if (!IsOrWithNonZeroConst)
+    return false;
+
+  // Only an operand with exactly one use is rewritten.
+  if (!Opnd1->getValue()->hasOneUse())
+    return false;
+
+  const APInt &OrMask = Opnd1->getConstPart();
+  if (OrMask != ConstOpnd)
+    return false;
+
+  Value *Sym = Opnd1->getSymbolicPart();
+  Res = createAndInstr(I, Sym, ~OrMask);
+  ConstOpnd ^= OrMask; // Was c2, becomes c1 ^ c2.
+
+  // Queue the replaced operand so it gets looked at again.
+  if (Instruction *Old = dyn_cast<Instruction>(Opnd1->getValue()))
+    RedoInsts.insert(Old);
+  return true;
+}
+
+
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value. Both operands must share the same symbolic part.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively (if the entire expression is
+// evaluated to a constant, the Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+ APInt &ConstOpnd, Value *&Res) {
+ Value *X = Opnd1->getSymbolicPart();
+ if (X != Opnd2->getSymbolicPart())
+ return false;
+
+ // This many instructions become dead. (At least "Opnd1 ^ Opnd2" will die.)
+ int DeadInstNum = 1;
+ if (Opnd1->getValue()->hasOneUse())
+ DeadInstNum++;
+ if (Opnd2->getValue()->hasOneUse())
+ DeadInstNum++;
+
+ // Xor-Rule 2:
+ // (x | c1) ^ (x & c2)
+ // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+ // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
+ // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-Rule 4
+ //
+ if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) { // Exactly one of the two operands is an 'or'.
+ if (Opnd2->isOrExpr())
+ std::swap(Opnd1, Opnd2); // Swaps the local pointers only, so that Opnd1 is the 'or'.
+
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3((~C1) ^ C2);
+
+ // Do not increase code size!
+ if (C3 != 0 && !C3.isAllOnesValue()) { // Only these masks make createAndInstr() emit an 'and'.
+ int NewInstNum = ConstOpnd != 0 ? 1 : 2; // NOTE(review): presumably the extra one accounts for a new xor-with-constant when ConstOpnd is currently 0 -- confirm.
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C1;
+
+ } else if (Opnd1->isOrExpr()) {
+ // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+
+ // Do not increase code size
+ if (C3 != 0 && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C3;
+ } else {
+ // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+ Res = createAndInstr(I, X, C3); // nullptr when C3 is 0; ConstOpnd is left untouched by this rule.
+ }
+
+ // Put the original operands in the Redo list; hope they will be deleted
+ // as dead code.
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+ RedoInsts.insert(T);
+
+ return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary and nullptr is returned. I is the insertion point handed to CombineXorOpnd().
+Value *Reassociate::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops)) // Generic X^X annihilation first.
+ return V;
+
+ if (Ops.size() == 1) // A lone operand leaves nothing to combine.
+ return nullptr;
+
+ SmallVector<XorOpnd, 8> Opnds;
+ SmallVector<XorOpnd*, 8> OpndPtrs;
+ Type *Ty = Ops[0].Op->getType();
+ APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); // Running xor of every constant operand.
+
+ // Step 1: Convert ValueEntry to XorOpnd
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *V = Ops[i].Op;
+ if (!isa<ConstantInt>(V)) {
+ XorOpnd O(V);
+ O.setSymbolicRank(getRank(O.getSymbolicPart()));
+ Opnds.push_back(O);
+ } else
+ ConstOpnd ^= cast<ConstantInt>(V)->getValue(); // Constants are folded into ConstOpnd instead of being kept as operands.
+ }
+
+ // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
+ // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
+ // the "OpndPtrs" as well. For the similar reason, do not fuse this loop
+ // with the previous loop --- the iterator of the "Opnds" may be invalidated
+ // when new elements are added to the vector.
+ for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+ OpndPtrs.push_back(&Opnds[i]);
+
+ // Step 2: Sort the Xor-Operands in a way such that the operands containing
+ // the same symbolic value cluster together. For instance, the input operand
+ // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+ // ("x | 123", "x & 789", "y & 456").
+ std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
+
+ // Step 3: Combine adjacent operands
+ XorOpnd *PrevOpnd = nullptr;
+ bool Changed = false;
+ for (unsigned i = 0, e = Opnds.size(); i < e; i++) { // OpndPtrs has exactly one entry per element of Opnds.
+ XorOpnd *CurrOpnd = OpndPtrs[i];
+ // The combined value
+ Value *CV; // Only read after a CombineXorOpnd() call returned true, which is when it gets written.
+
+ // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+ if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ Changed = true;
+ if (CV)
+ *CurrOpnd = XorOpnd(CV); // Overwrites the element in place, so the pointers in OpndPtrs stay valid.
+ else {
+ CurrOpnd->Invalidate();
+ continue;
+ }
+ }
+
+ if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+ PrevOpnd = CurrOpnd;
+ continue;
+ }
+
+ // step 3.2: When previous and current operands share the same symbolic
+ // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
+ //
+ if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+ // Remove previous operand
+ PrevOpnd->Invalidate();
+ if (CV) {
+ *CurrOpnd = XorOpnd(CV);
+ PrevOpnd = CurrOpnd;
+ } else {
+ CurrOpnd->Invalidate();
+ PrevOpnd = nullptr;
+ }
+ Changed = true;
+ }
+ }
+
+ // Step 4: Reassemble the Ops
+ if (Changed) {
+ Ops.clear();
+ for (unsigned int i = 0, e = Opnds.size(); i < e; i++) { // Walks Opnds, not the sorted OpndPtrs.
+ XorOpnd &O = Opnds[i];
+ if (O.isInvalid())
+ continue;
+ ValueEntry VE(getRank(O.getValue()), O.getValue());
+ Ops.push_back(VE);
+ }
+ if (ConstOpnd != 0) {
+ Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+ ValueEntry VE(getRank(C), C);
+ Ops.push_back(VE);
+ }
+ int Sz = Ops.size();
+ if (Sz == 1)
+ return Ops.back().Op;
+ else if (Sz == 0) {
+ assert(ConstOpnd == 0); // A non-zero constant would have been pushed just above.
+ return ConstantInt::get(Ty->getContext(), ConstOpnd);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Optimize a series of operands to an 'add' instruction. This
+/// optimizes based on identities. If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary and nullptr is returned.
+Value *Reassociate::OptimizeAdd(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and -X pairs. If we find any, we
+ // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it,
+ // scan for any
+ // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) { // e is re-synchronised with Ops.size() by hand after every edit of Ops.
+ Value *TheOp = Ops[i].Op;
+ // Check to see if we've seen this operand before. If so, we factor all
+ // instances of the operand together. Due to our sorting criteria, we know
+ // that these need to be next to each other in the vector.
+ if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+ // Rescan the list, remove all instances of this operand from the expr.
+ unsigned NumFound = 0;
+ do {
+ Ops.erase(Ops.begin()+i);
+ ++NumFound;
+ } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+ DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ ++NumFactor;
+
+ // Insert a new multiply.
+ Type *Ty = TheOp->getType();
+ Constant *C = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
+ Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
+
+ // Now that we have inserted a multiply, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+ RedoInsts.insert(Mul);
+
+ // If every add operand was a duplicate, return the multiply.
+ if (Ops.empty())
+ return Mul;
+
+ // Otherwise, we had some input that didn't have the dupe, such as
+ // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+ --i; // Cancels the loop's ++i so that index i is examined again.
+ e = Ops.size();
+ continue;
+ }
+
+ // Check for X and -X or X and ~X in the operand list.
+ if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
+ !BinaryOperator::isNot(TheOp))
+ continue;
+
+ Value *X = nullptr;
+ if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
+ X = BinaryOperator::getNegArgument(TheOp);
+ else if (BinaryOperator::isNot(TheOp))
+ X = BinaryOperator::getNotArgument(TheOp);
+
+ unsigned FoundX = FindInOperandList(Ops, i, X); // NOTE(review): presumably yields i itself when X is absent -- confirm.
+ if (FoundX == i)
+ continue;
+
+ // Remove X and -X from the operand list.
+ if (Ops.size() == 2 &&
+ (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
+ return Constant::getNullValue(X->getType());
+
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+ return Constant::getAllOnesValue(X->getType());
+
+ Ops.erase(Ops.begin()+i);
+ if (i < FoundX)
+ --FoundX; // The erase above shifted X down by one slot.
+ else
+ --i; // Need to back up an extra one.
+ Ops.erase(Ops.begin()+FoundX);
+ ++NumAnnihil;
+ --i; // Revisit element.
+ e -= 2; // Removed two elements.
+
+ // if X and ~X we append -1 to the operand list.
+ if (BinaryOperator::isNot(TheOp)) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
+ }
+
+ // Scan the operand list, checking to see if there are any common factors
+ // between operands. Consider something like A*A+A*B*C+D. We would like to
+ // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+ // To efficiently find this, we count the number of times a factor occurs
+ // for any ADD operands that are MULs.
+ DenseMap<Value*, unsigned> FactorOccurrences;
+
+ // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+ // where they are actually the same multiply.
+ unsigned MaxOcc = 0; // Highest occurrence count seen so far.
+ Value *MaxOccVal = nullptr; // The factor that reached MaxOcc.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ // Compute all of the factors of this added value.
+ SmallVector<Value*, 8> Factors;
+ FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+ assert(Factors.size() > 1 && "Bad linearize!");
+
+ // Add one to FactorOccurrences for each unique factor in this op.
+ SmallPtrSet<Value*, 8> Duplicates;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) { // NB: this i/e pair shadows the outer loop's.
+ Value *Factor = Factors[i];
+ if (!Duplicates.insert(Factor).second)
+ continue;
+
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+
+ // If Factor is a negative constant, add the negated value as a factor
+ // because we can percolate the negate out. Watch for minint, which
+ // cannot be positivified.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
+ if (CI->isNegative() && !CI->isMinValue(true)) {
+ Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+ assert(!Duplicates.count(Factor) &&
+ "Shouldn't have two constant factors, missed a canonicalize");
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+ if (CF->isNegative()) {
+ APFloat F(CF->getValueAPF());
+ F.changeSign();
+ Factor = ConstantFP::get(CF->getContext(), F);
+ assert(!Duplicates.count(Factor) &&
+ "Shouldn't have two constant factors, missed a canonicalize");
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ }
+ }
+ }
+
+ // If any factor occurred more than one time, we can pull it out.
+ if (MaxOcc > 1) {
+ DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ ++NumFactor;
+
+ // Create a new instruction that uses the MaxOccVal twice. If we don't do
+ // this, we could otherwise run into situations where removing a factor
+ // from an expression will drop a use of maxocc, and this can cause
+ // RemoveFactorFromExpression on successive values to behave differently.
+ Instruction *DummyInst =
+ I->getType()->isIntOrIntVectorTy()
+ ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+ : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal); // Created without an insertion point; deleted again below.
+
+ SmallVector<WeakVH, 4> NewMulOps;
+ for (unsigned i = 0; i != Ops.size(); ++i) {
+ // Only try to remove factors from expressions we're allowed to.
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+ // The factorized operand may occur several times. Convert them all in
+ // one fell swoop.
+ for (unsigned j = Ops.size(); j != i;) { // Walks backwards down to and including index i, so Ops[i] itself goes last.
+ --j;
+ if (Ops[j].Op == Ops[i].Op) {
+ NewMulOps.push_back(V);
+ Ops.erase(Ops.begin()+j);
+ }
+ }
+ --i; // Slot i was erased; stay on the same index for the next iteration.
+ }
+ }
+
+ // No need for extra uses anymore.
+ delete DummyInst;
+
+ unsigned NumAddedValues = NewMulOps.size();
+ Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+ // Now that we have inserted the add tree, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+ assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+ (void)NumAddedValues; // Only read by the assert above.
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ RedoInsts.insert(VI);
+
+ // Create the multiply.
+ Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
+
+ // Rerun associate on the multiply in case the inner expression turned into
+ // a multiply. We want to make sure that we keep things in canonical form.
+ RedoInsts.insert(V2);
+
+ // If every add operand included the factor (e.g. "A*B + A*C"), then the
+ // entire result expression is just the multiply "A*(B+C)".
+ if (Ops.empty())
+ return V2;
+
+ // Otherwise, we had some input that didn't have the factor, such as
+ // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+ }
+
+ return nullptr;
+}
+
+/// \brief Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power. The occurrences moved into Factors are erased from Ops.
+///
+/// (x*x) -> [(x, 2)]
+/// ((x*x)*x) -> [(x, 3)]
+/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
+ // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+ // Compute the sum of powers of simplifiable factors.
+ unsigned FactorPowerSum = 0;
+ for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) { // Idx is one past the first entry of the run being measured.
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Size && Ops[Idx].Op == Op; ++Idx) // Equal values are expected to sit next to each other.
+ ++Count;
+ // Track for simplification all factors which occur 2 or more times.
+ if (Count > 1)
+ FactorPowerSum += Count;
+ }
+
+ // We can only simplify factors if the sum of the powers of our simplifiable
+ // factors is 4 or higher. When that is the case, we will *always* have
+ // a simplification. This is an important invariant to prevent cyclically
+ // trying to simplify already minimal formations.
+ if (FactorPowerSum < 4)
+ return false;
+
+ // Now gather the simplifiable factors, removing them from Ops.
+ FactorPowerSum = 0;
+ for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) { // Ops shrinks inside the loop, so its size is re-read every time.
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ if (Count == 1)
+ continue;
+ // Move an even number of occurrences to Factors.
+ Count &= ~1U; // Round down to even; an odd run leaves one occurrence behind in Ops.
+ Idx -= Count; // Step back to the first of the Count entries that get erased.
+ FactorPowerSum += Count;
+ Factors.push_back(Factor(Op, Count));
+ Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+ }
+
+ // None of the adjustments above should have reduced the sum of factor powers
+ // below our minimum of '4'.
+ assert(FactorPowerSum >= 4);
+
+ std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+ return true;
+}
+
+/// \brief Multiply all of the values in Ops together and return the product.
+///
+/// A single value is returned unchanged. Otherwise the values are consumed
+/// from the back of Ops, leaving it empty, and chained onto a running product
+/// that always sits on the left-hand side. Integer or floating point
+/// multiplies are chosen from the type of that running product.
+static Value *buildMultiplyTree(IRBuilder<> &Builder,
+                                SmallVectorImpl<Value*> &Ops) {
+  if (Ops.size() == 1)
+    return Ops.back();
+
+  Value *Product = Ops.pop_back_val();
+  do {
+    Value *Next = Ops.pop_back_val();
+    Product = Product->getType()->isIntOrIntVectorTy()
+                  ? Builder.CreateMul(Product, Next)
+                  : Builder.CreateFMul(Product, Next);
+  } while (!Ops.empty());
+
+  return Product;
+}
+
+/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value. Factors is modified in place (entries merged, powers halved) and the function recurses on it.
+Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ SmallVectorImpl<Factor> &Factors) {
+ assert(Factors[0].Power);
+ SmallVector<Value *, 4> OuterProduct;
+ for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+ Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+ if (Factors[Idx].Power != Factors[LastIdx].Power) {
+ LastIdx = Idx; // A new power starts here; remember its first factor.
+ continue;
+ }
+
+ // We want to multiply across all the factors with the same power so that
+ // we can raise them to that power as a single entity. Build a mini tree
+ // for that.
+ SmallVector<Value *, 4> InnerProduct;
+ InnerProduct.push_back(Factors[LastIdx].Base);
+ do {
+ InnerProduct.push_back(Factors[Idx].Base);
+ ++Idx;
+ } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+ // Reset the base value of the first factor to the new expression tree.
+ // We'll remove all the factors with the same power in a second pass.
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+ if (Instruction *MI = dyn_cast<Instruction>(M))
+ RedoInsts.insert(MI);
+
+ LastIdx = Idx;
+ }
+ // Unique factors with equal powers -- we've folded them into the first one's
+ // base.
+ Factors.erase(std::unique(Factors.begin(), Factors.end(),
+ Factor::PowerEqual()),
+ Factors.end());
+
+ // Iteratively collect the base of each factor with an odd power into the
+ // outer product, and halve each power in preparation for squaring the
+ // expression.
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+ if (Factors[Idx].Power & 1)
+ OuterProduct.push_back(Factors[Idx].Base);
+ Factors[Idx].Power >>= 1;
+ }
+ if (Factors[0].Power) { // Some power is still non-zero after halving: recurse for the square root.
+ Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+ OuterProduct.push_back(SquareRoot); // Pushed twice so that the product below squares it.
+ OuterProduct.push_back(SquareRoot);
+ }
+ if (OuterProduct.size() == 1)
+ return OuterProduct.front();
+
+ Value *V = buildMultiplyTree(Builder, OuterProduct);
+ return V;
+}
+
+/// Try to rebuild a chain of multiplies as a minimal multiply DAG.
+///
+/// Returns the replacement value when every operand was absorbed into the
+/// DAG. Otherwise returns nullptr; if repeated factors were collected, Ops has
+/// been rewritten to hold the leftover operands plus the newly built product.
+Value *Reassociate::OptimizeMul(BinaryOperator *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  // Chains of three or fewer operands are left alone: only with more than
+  // three might a balanced tree require fewer total multiplies.
+  if (Ops.size() < 4)
+    return nullptr;
+
+  // Pull repeated factors out of Ops. When there are none to collect, every
+  // factor is effectively distinct and there is nothing left for us to do.
+  SmallVector<Factor, 4> Factors;
+  const bool HasRepeatedFactors = collectMultiplyFactors(Ops, Factors);
+  if (!HasRepeatedFactors)
+    return nullptr;
+
+  IRBuilder<> Builder(I);
+  Value *Product = buildMinimalMultiplyDAG(Builder, Factors);
+  if (!Ops.empty()) {
+    // Some operands were not absorbed: slot the product in among them at the
+    // position std::lower_bound picks.
+    ValueEntry ProductEntry(getRank(Product), Product);
+    auto InsertPos = std::lower_bound(Ops.begin(), Ops.end(), ProductEntry);
+    Ops.insert(InsertPos, ProductEntry);
+    return nullptr;
+  }
+
+  return Product;
+}
+
+/// Simplify the linearized operand list Ops of the expression rooted at I.
+///
+/// Returns the single value the expression reduces to, if any. Otherwise
+/// returns nullptr, with Ops holding the (possibly rewritten) operands.
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+                                       SmallVectorImpl<ValueEntry> &Ops) {
+  // Each trip around this loop is one full round of simplification. Rounds
+  // repeat for as long as the previous one changed the number of operands.
+  for (;;) {
+    const unsigned Opcode = I->getOpcode();
+
+    // Fold together whatever constants sit at the back of the list.
+    Constant *Folded = nullptr;
+    while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+      Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+      Folded = Folded ? ConstantExpr::get(Opcode, C, Folded) : C;
+    }
+
+    // Nothing but constants: the folded constant is the answer.
+    if (Ops.empty())
+      return Folded;
+
+    // Re-append the folded constant unless it is pointless: the identity
+    // (e.g. add of 0) is dropped, while the absorber (e.g. multiply by zero)
+    // turns the whole expression into that constant.
+    if (Folded &&
+        Folded != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+      if (Folded == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+        return Folded;
+      Ops.push_back(ValueEntry(0, Folded));
+    }
+
+    if (Ops.size() == 1)
+      return Ops[0].Op;
+
+    // Apply the opcode-specific identities between the list elements.
+    const unsigned SizeBefore = Ops.size();
+    Value *Reduced = nullptr;
+    switch (Opcode) {
+    default:
+      break;
+    case Instruction::And:
+    case Instruction::Or:
+      Reduced = OptimizeAndOrXor(Opcode, Ops);
+      break;
+    case Instruction::Xor:
+      Reduced = OptimizeXor(I, Ops);
+      break;
+    case Instruction::Add:
+    case Instruction::FAdd:
+      Reduced = OptimizeAdd(I, Ops);
+      break;
+    case Instruction::Mul:
+    case Instruction::FMul:
+      Reduced = OptimizeMul(I, Ops);
+      break;
+    }
+    if (Reduced)
+      return Reduced;
+
+    // An unchanged operand count means this round found nothing more to do.
+    if (Ops.size() == SizeBefore)
+      return nullptr;
+  }
+}
+
+// Erase the trivially dead instruction I after dropping it from the rank map,
+// from Insts and from the redo list. Every operand that is an instruction and
+// has no uses left afterwards is added to Insts.
+void Reassociate::RecursivelyEraseDeadInsts(
+    Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+
+  // Snapshot the operands before I is erased.
+  SmallVector<Value *, 4> Operands(I->op_begin(), I->op_end());
+
+  ValueRankMap.erase(I);
+  Insts.remove(I);
+  RedoInsts.remove(I);
+  I->eraseFromParent();
+
+  for (Value *Operand : Operands) {
+    Instruction *Def = dyn_cast<Instruction>(Operand);
+    if (Def && Def->use_empty())
+      Insts.insert(Def);
+  }
+}
+
+/// Delete the trivially dead instruction I and queue its instruction operands
+/// (or rather the roots of the expression trees they sit in) for another
+/// optimization attempt.
+void Reassociate::EraseInst(Instruction *I) {
+  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+
+  // Snapshot the operands before I is erased.
+  SmallVector<Value*, 8> Operands(I->op_begin(), I->op_end());
+
+  ValueRankMap.erase(I);
+  RedoInsts.remove(I);
+  I->eraseFromParent();
+
+  // Shared by all operands; guards the climb below against self-referential
+  // nodes.
+  SmallPtrSet<Instruction *, 8> Seen;
+  for (Value *Operand : Operands) {
+    Instruction *Node = dyn_cast<Instruction>(Operand);
+    if (!Node)
+      continue;
+
+    // Climb through single-use users with the same opcode: the root of the
+    // expression tree is where optimization actually happens.
+    const unsigned Opcode = Node->getOpcode();
+    for (;;) {
+      if (!Node->hasOneUse())
+        break;
+      Instruction *Parent = Node->user_back();
+      if (Parent->getOpcode() != Opcode || !Seen.insert(Node).second)
+        break;
+      Node = Parent;
+    }
+    RedoInsts.insert(Node);
+  }
+}
+
+// Move a negative floating point constant out of a multiply/divide that
+// feeds an fadd/fsub, flipping the user instead:
+//  x + (-Constant * y) -> x - (Constant * y)
+//  x - (-Constant * y) -> x + (Constant * y)
+//
+// I is the fmul/fdiv. Returns the instruction that replaces I's user, or
+// nullptr when the pattern does not match (nothing is changed in that case).
+Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
+  if (!I->hasOneUse() || I->getType()->isVectorTy())
+    return nullptr;
+
+  // Only fmul and fdiv are handled.
+  const unsigned Opcode = I->getOpcode();
+  const bool IsFMulOrFDiv =
+      Opcode == Instruction::FMul || Opcode == Instruction::FDiv;
+  if (!IsFMulOrFDiv)
+    return nullptr;
+
+  ConstantFP *LHSConst = dyn_cast<ConstantFP>(I->getOperand(0));
+  ConstantFP *RHSConst = dyn_cast<ConstantFP>(I->getOperand(1));
+
+  // Exactly one operand must be a ConstantFP. Two constants are left alone so
+  // the instruction can get constant folded away.
+  if ((LHSConst != nullptr) == (RHSConst != nullptr))
+    return nullptr;
+
+  // ... and that constant has to be negative.
+  ConstantFP *NegConst = LHSConst ? LHSConst : RHSConst;
+  if (!NegConst->isNegative())
+    return nullptr;
+
+  // The sole user must be a binary operator that itself has at least one use.
+  Instruction *User = I->user_back();
+  if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
+    return nullptr;
+
+  const unsigned UserOpcode = User->getOpcode();
+  if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
+    return nullptr;
+
+  // Subtraction is not commutative. Explicitly, the following transform is
+  // not valid: (-Constant * y) - x -> x + (Constant * y)
+  if (!User->isCommutative() && User->getOperand(1) != I)
+    return nullptr;
+
+  // From here on the IR is modified. First flip the sign of the constant.
+  APFloat Flipped = NegConst->getValueAPF();
+  Flipped.changeSign();
+  const unsigned ConstIdx = LHSConst ? 0 : 1;
+  I->setOperand(ConstIdx, ConstantFP::get(NegConst->getContext(), Flipped));
+
+  // Make sure I is the right-hand operand of its user, e.g.
+  // ((-Const*y) + x) -> (x + (-Const*y)).
+  if (User->getOperand(0) == I && User->isCommutative())
+    cast<BinaryOperator>(User)->swapOperands();
+
+  // Build the opposite operation on the same operands.
+  Value *LHS = User->getOperand(0);
+  Value *RHS = User->getOperand(1);
+  BinaryOperator *Replacement;
+  switch (UserOpcode) {
+  default:
+    llvm_unreachable("Unexpected Opcode!");
+  case Instruction::FAdd:
+    Replacement = BinaryOperator::CreateFSub(LHS, RHS);
+    break;
+  case Instruction::FSub:
+    Replacement = BinaryOperator::CreateFAdd(LHS, RHS);
+    break;
+  }
+  Replacement->setFastMathFlags(
+      cast<FPMathOperator>(User)->getFastMathFlags());
+
+  Replacement->insertBefore(User);
+  Replacement->setName(User->getName());
+  User->replaceAllUsesWith(Replacement);
+  Replacement->setDebugLoc(I->getDebugLoc());
+  RedoInsts.insert(I);
+  MadeChange = true;
+  return Replacement;
+}
+
+/// Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed; instructions that were replaced or need another look are queued in RedoInsts instead.
+void Reassociate::OptimizeInst(Instruction *I) {
+ // Only consider operations that we understand.
+ if (!isa<BinaryOperator>(I))
+ return;
+
+ if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn into a multiply.
+ if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+ (I->hasOneUse() &&
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
+ Instruction *NI = ConvertShiftToMul(I);
+ RedoInsts.insert(I); // Queue the old shift for another visit.
+ MadeChange = true;
+ I = NI; // Continue with the replacement multiply.
+ }
+
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegConstExpr(I))
+ I = Res;
+
+ // Commute binary operators, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing other
+ // transformations simpler.
+ if (I->isCommutative())
+ canonicalizeOperands(I);
+
+ // TODO: We should optimize vector Xor instructions, but they are
+ // currently unsupported.
+ if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
+ return;
+
+ // Don't optimize floating point instructions that don't have unsafe algebra.
+ if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
+ return;
+
+ // Do not reassociate boolean (i1) expressions. We want to preserve the
+ // original order of evaluation for short-circuited comparisons that
+ // SimplifyCFG has folded to AND/OR expressions. If the expression
+ // is not further optimized, it is likely to be transformed back to a
+ // short-circuited form for code gen, and the source order may have been
+ // optimized for the most likely conditions.
+ if (I->getType()->isIntegerTy(1))
+ return;
+
+ // If this is a subtract instruction which is not already in negate form,
+ // see if we can convert it to X+-Y.
+ if (I->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::Mul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ } else if (I->getOpcode() == Instruction::FSub) { // Floating point twin of the Sub handling above.
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (BinaryOperator::isFNeg(I)) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::FMul))) {
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ Instruction *NI = LowerNegateToMultiply(I);
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ }
+
+ // If this instruction is an associative binary operator, process it.
+ if (!I->isAssociative()) return;
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+
+ // If this is an interior node of a reassociable tree, ignore it until we
+ // get to the root of the tree, to avoid N^2 analysis.
+ unsigned Opcode = BO->getOpcode();
+ if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+ // During the initial run we will get to the root of the tree.
+ // But if we get here while we are redoing instructions, there is no
+ // guarantee that the root will be visited. So Redo later
+ if (BO->user_back() != BO && // Skip an instruction that is its own user.
+ BO->getParent() == BO->user_back()->getParent()) // Only queue a user that lives in the same basic block.
+ RedoInsts.insert(BO->user_back());
+ return;
+ }
+
+ // If this is an add tree that is used by a sub instruction, ignore it
+ // until we process the subtract.
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
+ return;
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+ return;
+
+ ReassociateExpression(BO); // BO is the root of its expression tree at this point.
+}
+
+/// Reassociate the expression tree rooted at \p I: flatten it into a ranked
+/// operand list, let OptimizeExpression simplify that list, and then either
+/// replace \p I outright (when everything collapsed to a single value) or
+/// rewrite the tree using the new operand order.
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+  // Flatten the tree into (leaf, repeat count) pairs, then expand every pair
+  // into that many ranked operand entries.
+  SmallVector<RepeatedValue, 8> Tree;
+  MadeChange |= LinearizeExprTree(I, Tree);
+  SmallVector<ValueEntry, 8> Ops;
+  Ops.reserve(Tree.size());
+  for (const RepeatedValue &Leaf : Tree)
+    Ops.append(Leaf.second.getZExtValue(),
+               ValueEntry(getRank(Leaf.first), Leaf.first));
+
+  DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  // Order the operands by rank. The sort is stable so that operands of equal
+  // rank keep their relative order, which keeps the compiler deterministic.
+  // After sorting, the highest ranking values sit at the front of the vector.
+  std::stable_sort(Ops.begin(), Ops.end());
+
+  // With the operands in sorted form, try to optimize the expression as a
+  // whole.
+  if (Value *Simplified = OptimizeExpression(I, Ops)) {
+    // A self-referential expression can only occur in unreachable code.
+    if (Simplified == I)
+      return;
+    // The whole tree folded to one value: forward the uses to it and queue
+    // the old root for another visit.
+    DEBUG(dbgs() << "Reassoc to scalar: " << *Simplified << '\n');
+    I->replaceAllUsesWith(Simplified);
+    if (Instruction *SimplifiedInst = dyn_cast<Instruction>(Simplified))
+      SimplifiedInst->setDebugLoc(I->getDebugLoc());
+    RedoInsts.insert(I);
+    ++NumAnnihil;
+    return;
+  }
+
+  // Immediates are normally sunk as deeply as possible. The exception is a
+  // multiply tree whose only user is an add and whose last operand is -1:
+  // there the -1 is moved to the front so the negation ends up outermost and
+  // can be folded into the add: (-X)*Y + Z -> Z-X*Y
+  if (I->hasOneUse()) {
+    bool HoistNegOne = false;
+    switch (I->getOpcode()) {
+    case Instruction::Mul:
+      if (cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add)
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(Ops.back().Op))
+          HoistNegOne = CI->isAllOnesValue();
+      break;
+    case Instruction::FMul:
+      if (cast<Instruction>(I->user_back())->getOpcode() ==
+          Instruction::FAdd)
+        if (ConstantFP *CF = dyn_cast<ConstantFP>(Ops.back().Op))
+          HoistNegOne = CF->isExactlyValue(-1.0);
+      break;
+    default:
+      break;
+    }
+    if (HoistNegOne) {
+      ValueEntry NegOne = Ops.pop_back_val();
+      Ops.insert(Ops.begin(), NegOne);
+    }
+  }
+
+  DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  if (Ops.size() == 1) {
+    Value *Only = Ops[0].Op;
+    // A self-referential expression can only occur in unreachable code.
+    if (Only == I)
+      return;
+
+    // A single operand is left, so the tree is no longer a tree: forward the
+    // uses to that operand and queue the old root for another visit.
+    I->replaceAllUsesWith(Only);
+    if (Instruction *OnlyInst = dyn_cast<Instruction>(Only))
+      OnlyInst->setDebugLoc(I->getDebugLoc());
+    RedoInsts.insert(I);
+    return;
+  }
+
+  // Write the ordered, optimized operands back into the expression tree,
+  // dropping any nodes that are no longer needed.
+  RewriteExprTree(I, Ops);
+}
+
+bool Reassociate::runOnFunction(Function &F) { // Reassociates F one basic block at a time; returns MadeChange.
+ if (skipOptnoneFunction(F))
+ return false;
+
+ // Calculate the rank map for F
+ BuildRankMap(F);
+
+ MadeChange = false;
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ // Optimize every instruction in the basic block.
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; )
+ if (isInstructionTriviallyDead(&*II)) {
+ EraseInst(&*II++); // Post-increment: II already points past the instruction when it is erased.
+ } else {
+ OptimizeInst(&*II);
+ assert(II->getParent() == BI && "Moved to a different block!");
+ ++II;
+ }
+
+ // Make a copy of all the instructions to be redone so we can remove dead
+ // instructions.
+ SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ // Iterate over all instructions to be reevaluated and remove trivially dead
+ // instructions. If any operand of the trivially dead instruction becomes
+ // dead, mark it for deletion as well. Continue this process until all
+ // trivially dead instructions have been removed.
+ while (!ToRedo.empty()) {
+ Instruction *I = ToRedo.pop_back_val();
+ if (isInstructionTriviallyDead(I))
+ RecursivelyEraseDeadInsts(I, ToRedo);
+ }
+
+ // Now that we have removed dead instructions, we can reoptimize the
+ // remaining instructions. The loop runs until RedoInsts has been drained.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.pop_back_val();
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
+
+ // We are done with the rank map.
+ RankMap.clear();
+ ValueRankMap.clear();
+
+ return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 0000000..915f897
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,132 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister. By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "reg2mem"
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+  /// Function pass that demotes registers to stack slots (see runOnFunction).
+  struct RegToMem : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    RegToMem() : FunctionPass(ID) {
+      initializeRegToMemPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// The pass requires BreakCriticalEdges and declares it preserved.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addPreservedID(BreakCriticalEdgesID);
+    }
+
+    /// Return true if \p Inst has at least one user that is a PHI node or
+    /// that lives in a basic block other than the one defining \p Inst.
+    bool valueEscapes(const Instruction *Inst) const {
+      const BasicBlock *Home = Inst->getParent();
+      for (const User *U : Inst->users()) {
+        const Instruction *UserInst = cast<Instruction>(U);
+        if (isa<PHINode>(UserInst) || UserInst->getParent() != Home)
+          return true;
+      }
+      return false;
+    }
+
+    bool runOnFunction(Function &F) override;
+  };
+}
+
+char RegToMem::ID = 0; // Out-of-line definition of the static pass ID declared in RegToMem.
+INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) // Mirrors addRequiredID(BreakCriticalEdgesID) in getAnalysisUsage.
+INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
+ false, false)
+
+/// Demote to stack slots first every instruction whose value escapes its
+/// defining block (entry-block allocas excepted), then every PHI node.
+/// Returns false only for declarations; otherwise the function is always
+/// modified, because the alloca insertion marker is inserted unconditionally.
+bool RegToMem::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  // All new allocas are inserted into the entry block.
+  BasicBlock *EntryBB = &F.getEntryBlock();
+  assert(pred_empty(EntryBB) &&
+         "Entry block to function must not have predecessors!");
+
+  // Skip the leading allocas to find the insertion position. This is safe
+  // for a well-formed block: it always has a terminator, so the scan stops
+  // there at the latest; on a malformed block we'll get an assertion.
+  BasicBlock::iterator FirstNonAlloca = EntryBB->begin();
+  while (isa<AllocaInst>(FirstNonAlloca))
+    ++FirstNonAlloca;
+
+  // A no-op bitcast of a null i32, handed to the demotion helpers below as
+  // the alloca insertion point.
+  CastInst *AllocaInsertionPoint = new BitCastInst(
+      Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+      Type::getInt32Ty(F.getContext()), "reg2mem alloca point",
+      &*FirstNonAlloca);
+
+  // Collect the escaping instructions, but don't create stack slots for
+  // allocas in the entry block.
+  std::list<Instruction*> WorkList;
+  for (BasicBlock &BB : F)
+    for (Instruction &Inst : BB) {
+      const bool IsEntryAlloca =
+          isa<AllocaInst>(Inst) && Inst.getParent() == EntryBB;
+      if (!IsEntryAlloca && valueEscapes(&Inst))
+        WorkList.push_front(&Inst);
+    }
+
+  // Demote the escaping instructions.
+  NumRegsDemoted += WorkList.size();
+  for (Instruction *Escaped : WorkList)
+    DemoteRegToStack(*Escaped, false, AllocaInsertionPoint);
+
+  WorkList.clear();
+
+  // Collect all PHI nodes.
+  for (BasicBlock &BB : F)
+    for (Instruction &Inst : BB)
+      if (isa<PHINode>(Inst))
+        WorkList.push_front(&Inst);
+
+  // Demote the PHI nodes.
+  NumPhisDemoted += WorkList.size();
+  for (Instruction *Phi : WorkList)
+    DemotePHIToStack(cast<PHINode>(Phi), AllocaInsertionPoint);
+
+  return true;
+}
+
+
+// createDemoteRegisterToMemoryPass - Entry point for creating this pass.
+char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+ return new RegToMem();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
new file mode 100644
index 0000000..d77d574
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -0,0 +1,2915 @@
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite an existing set of gc.statepoints such that they make potential
+// relocations performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+ cl::init(false)); // Print the called value's name and the live-value count per safepoint
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+ cl::init(false));
+
+// Cost threshold measuring when it is profitable to rematerialize a value
+// instead of relocating it
+static cl::opt<unsigned>
+RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
+ cl::init(6));
+
+#ifdef XDEBUG
+static bool ClobberNonLive = true;
+#else
+static bool ClobberNonLive = false;
+#endif
+static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
+ cl::location(ClobberNonLive),
+ cl::Hidden); // Stores into ClobberNonLive, whose default is true only under XDEBUG
+
+static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool>
+ AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
+ cl::Hidden, cl::init(true));
+
+/// Should we split vectors of pointers into their individual elements? This
+/// is known to be buggy, but the alternate implementation isn't yet ready.
+/// This is purely to provide a debugging and diagnostic hook until the vector
+/// split is replaced with vector relocations.
+static cl::opt<bool> UseVectorSplit("rs4gc-split-vector-values", cl::Hidden,
+ cl::init(true));
+
+namespace {
+/// Module pass that runs the per-function statepoint rewrite on every
+/// function of a module and then strips attributes the rewrite invalidated.
+struct RewriteStatepointsForGC : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+
+  RewriteStatepointsForGC() : ModulePass(ID) {
+    initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F);
+
+  /// Rewrite every function in \p M; returns true if any function changed.
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    for (Function &F : M)
+      Changed |= runOnFunction(F);
+
+    if (!Changed)
+      return false;
+
+    // stripNonValidAttributes asserts that shouldRewriteStatepointsIn
+    // returns true for at least one function in the module. Since at least
+    // one function changed, we know that the precondition is satisfied.
+    stripNonValidAttributes(M);
+    return true;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // This pass adds and rewrites a number of instructions but does little
+    // else, so in theory many more analyses could be preserved here.
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  /// The IR fed into RewriteStatepointsForGC may carry attributes implying
+  /// dereferenceability that are no longer valid/correct once
+  /// RewriteStatepointsForGC has run: semantically, every call to
+  /// gc.statepoint then "frees" the entire heap. stripNonValidAttributes
+  /// (conservatively) restores correctness by erasing all attributes in the
+  /// module that externally imply dereferenceability.
+  /// The same reasoning applies to the noalias attributes, because
+  /// gc.statepoint can touch the entire heap including noalias objects.
+  void stripNonValidAttributes(Module &M);
+
+  // Helpers for stripNonValidAttributes
+  void stripNonValidAttributesFromBody(Function &F);
+  void stripNonValidAttributesFromPrototype(Function &F);
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+/// Entry point for creating the pass.
+ModulePass *llvm::createRewriteStatepointsForGCPass() {
+  return new RewriteStatepointsForGC();
+}
+
+// Pass registration. Each analysis that getAnalysisUsage marks as required
+// (DominatorTreeWrapperPass and TargetTransformInfoWrapperPass) is listed as
+// a dependency so that it is initialized together with this pass.
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                      "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                    "Make relocations explicit at statepoints", false, false)
+
+namespace {
+struct GCPtrLivenessData {
+ /// Values defined in this block.
+ DenseMap<BasicBlock *, DenseSet<Value *>> KillSet;
+ /// Values used in this block (and thus live); does not include values
+ /// killed within this block.
+ DenseMap<BasicBlock *, DenseSet<Value *>> LiveSet;
+
+ /// Values live into this basic block (i.e. used by any
+ /// instruction in this basic block or ones reachable from here)
+ DenseMap<BasicBlock *, DenseSet<Value *>> LiveIn;
+
+ /// Values live out of this basic block (i.e. live into
+ /// any successor block)
+ DenseMap<BasicBlock *, DenseSet<Value *>> LiveOut;
+};
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all entries of the second type to the first type.
+typedef DenseMap<Value *, Value *> DefiningValueMapTy;
+typedef DenseSet<Value *> StatepointLiveSetTy;
+typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>>
+ RematerializedValueMapTy;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live across this safepoint
+ StatepointLiveSetTy LiveSet;
+
+ /// Mapping from live pointers to a base-defining-value
+ DenseMap<Value *, Value *> PointerToBase;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ Instruction *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached.
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+
+ /// Record live values that are rematerialized instead of relocated.
+ /// They are not included in the 'LiveSet' field.
+ /// Maps each rematerialized copy to its original value.
+ RematerializedValueMapTy RematerializedValues;
+};
+}
+
+/// Return the inputs of the "deopt" operand bundle attached to \p CS, or an
+/// empty range when the call site carries no such bundle. Only to be called
+/// when UseDeoptBundles is set.
+static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) {
+  assert(UseDeoptBundles && "Should not be called otherwise!");
+
+  Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt");
+  if (DeoptBundle.hasValue())
+    return DeoptBundle.getValue().Inputs;
+
+  // A missing bundle is tolerated only when the corresponding option is on.
+  assert(AllowStatepointWithNoDeoptInfo &&
+         "Found non-leaf call without deopt info!");
+  return None;
+}
+
+/// Compute the live-in set for every basic block in the function
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data);
+
+/// Given results from the dataflow liveness computation, find the set of live
+/// Values at a particular instruction.
+static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &out);
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+
+/// Return true if \p T is a pointer type in address space 1.
+static bool isGCPointerType(Type *T) {
+  // For the sake of this example GC, addrspace(1) is arbitrarily picked as
+  // the GC managed heap. A pointer into this heap needs to be updated; no
+  // other pointer does.
+  auto *PT = dyn_cast<PointerType>(T);
+  return PT && 1 == PT->getAddressSpace();
+}
+
+// Return true if this type a) is a gc pointer or contains a GC pointer and
+// b) is of a kind this code expects to encounter as a live value. (The
+// insertion code will assert that a type which matches (a) and not (b) is
+// not encountered.)
+static bool isHandledGCPointerType(Type *T) {
+  // Plain gc pointers are fully supported.
+  if (isGCPointerType(T))
+    return true;
+  // Vectors of gc pointers are partially supported. The code will assert if
+  // it can't handle something.
+  auto *VT = dyn_cast<VectorType>(T);
+  return VT && isGCPointerType(VT->getElementType());
+}
+
+#ifndef NDEBUG
+/// Returns true if \p Ty is a gc pointer, a vector of gc pointers, or an
+/// array/struct that (transitively) contains one, regardless of whether the
+/// rest of this file knows how to handle that type.
+static bool containsGCPtrType(Type *Ty) {
+  if (isGCPointerType(Ty))
+    return true;
+  if (auto *VecTy = dyn_cast<VectorType>(Ty))
+    return isGCPointerType(VecTy->getScalarType());
+  if (auto *ArrTy = dyn_cast<ArrayType>(Ty))
+    return containsGCPtrType(ArrTy->getElementType());
+  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
+    // A struct qualifies as soon as any one of its fields does.
+    for (Type *FieldTy : StructTy->subtypes())
+      if (containsGCPtrType(FieldTy))
+        return true;
+    return false;
+  }
+  return false;
+}
+
+// Returns true if this type a) is a gc pointer or contains a GC pointer and
+// b) is of a kind the code doesn't expect (i.e. first class aggregates).
+// Used to trip assertions.
+static bool isUnhandledGCPointerType(Type *Ty) {
+  if (!containsGCPtrType(Ty))
+    return false;
+  return !isHandledGCPointerType(Ty);
+}
+#endif
+
+/// Sort predicate: named values come before unnamed ones, named values are
+/// ordered by name, and unnamed values fall back to pointer order.
+static bool order_by_name(Value *a, Value *b) {
+  const bool aNamed = a->hasName();
+  const bool bNamed = b->hasName();
+
+  // Exactly one of the two is named: the named one sorts first.
+  if (aNamed != bNamed)
+    return aNamed;
+
+  if (aNamed)
+    return -1 == a->getName().compare(b->getName());
+
+  // Neither has a name. Better than nothing, but not stable.
+  return a < b;
+}
+
+// Return the name of \p V with \p Suffix appended, or \p DefaultName when
+// \p V has no name.
+static std::string suffixed_name_or(Value *V, StringRef Suffix,
+                                    StringRef DefaultName) {
+  if (!V->hasName())
+    return DefaultName.str();
+  return (V->getName() + Suffix).str();
+}
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+//
+// The computed set is stored in result.LiveSet. When the PrintLiveSet or
+// PrintLiveSetSize option is set, a dump of the set is written to errs().
+static void analyzeParsePointLiveness(
+    DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData,
+    const CallSite &CS, PartiallyConstructedSafepointRecord &result) {
+  Instruction *inst = CS.getInstruction();
+
+  StatepointLiveSetTy LiveSet;
+  findLiveSetAtInst(inst, OriginalLivenessData, LiveSet);
+
+  if (PrintLiveSet) {
+    // Note: This output is used by several of the test cases
+    // The order of elements in a set is not stable, put them in a vec and sort
+    // by name
+    SmallVector<Value *, 64> Temp;
+    Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end());
+    std::sort(Temp.begin(), Temp.end(), order_by_name);
+    // The header and the entries go to the same stream so they cannot be
+    // separated when dbgs() is redirected or buffered.
+    errs() << "Live Variables:\n";
+    for (Value *V : Temp)
+      errs() << " " << V->getName() << " " << *V << "\n";
+  }
+  if (PrintLiveSetSize) {
+    errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+    errs() << "Number live values: " << LiveSet.size() << "\n";
+  }
+  result.LiveSet = LiveSet;
+}
+
+static bool isKnownBaseResult(Value *V);
+namespace {
+/// A single base defining value - An immediate base defining value for an
+/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
+/// For instructions which have multiple pointer [vector] inputs or that
+/// transition between vector and scalar types, there is no immediate base
+/// defining value. The 'base defining value' for 'Def' is the transitive
+/// closure of this relation stopping at the first instruction which has no
+/// immediate base defining value. The b.d.v. might itself be a base pointer,
+/// but it can also be an arbitrary derived pointer.
+struct BaseDefiningValueResult {
+ /// Contains the value which is the base defining value.
+ Value * const BDV;
+ /// True if the base defining value is also known to be an actual base
+ /// pointer.
+ const bool IsKnownBase;
+ BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
+ : BDV(BDV), IsKnownBase(IsKnownBase) {
+#ifndef NDEBUG
+ // Check consistency between the new and old means of checking whether a
+ // BDV is a base.
+ bool MustBeBase = isKnownBaseResult(BDV);
+ assert(!MustBeBase || MustBeBase == IsKnownBase); // If the old check says "base", IsKnownBase must be true too.
+#endif
+ }
+};
+}
+
+static BaseDefiningValueResult findBaseDefiningValue(Value *I);
+
+/// Return a base defining value for the vector-typed value 'I', together
+/// with a flag telling whether that value is already known to be a base.
+/// Arguments, constants and loads are reported as known bases. For
+/// insertelement, shufflevector, select and phi, 'I' itself is returned as a
+/// BDV that is not known to be a base.
+static BaseDefiningValueResult
+findBaseDefiningValueOfVector(Value *I) {
+  // Each case parallels findBaseDefiningValue below, see that code for
+  // detailed motivation.
+
+  // An incoming function argument is a base pointer, constant vectors
+  // consist only of constant pointers, and a load is treated as a base too.
+  if (isa<Argument>(I) || isa<Constant>(I) || isa<LoadInst>(I))
+    return BaseDefiningValueResult(I, true);
+
+  // For insertelement and shufflevector it is unknown whether the vector
+  // contains only base pointers. To be conservatively correct, treat it as a
+  // BDV; code will be duplicated as needed to build a parallel vector of
+  // bases.
+  // TODO: There are a number of local optimizations which could be applied
+  // here for particular shufflevector patterns.
+  if (isa<InsertElementInst>(I) || isa<ShuffleVectorInst>(I))
+    return BaseDefiningValueResult(I, false);
+
+  // What remains must be a PHI or a Select, which is a base defining value.
+  // The outer findBasePointer algorithm is responsible for constructing a
+  // base value for this BDV.
+  assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+         "unknown vector instruction - no base found for vector element");
+  return BaseDefiningValueResult(I, false);
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input, b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
+/// from pointer to vector type or back.
+static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPtrOrPtrVectorTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ if (I->getType()->isVectorTy())
+ return findBaseDefiningValueOfVector(I);
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer.
+ // We should have never reached here if this argument isn't a gc value.
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I))
+ // We assume that objects with a constant base (e.g. a global) can't move
+ // and don't need to be reported to the collector because they are always
+ // live. All constants have constant bases. Besides global references, all
+ // kinds of constants (e.g. undef, constant expressions, null pointers) can
+ // be introduced by the inliner or the optimizer, especially on dynamically
+ // dead paths. See e.g. test4 in constants.ll.
+ return BaseDefiningValueResult(I, true);
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Def = CI->stripPointerCasts();
+ // If stripping pointer casts changes the address space there is an
+ // addrspacecast in between.
+ assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+ cast<PointerType>(CI->getType())->getAddressSpace() &&
+ "unsupported addrspacecast");
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(Def);
+ }
+
+ if (isa<LoadInst>(I))
+ // The value loaded is a gc base itself.
+ return BaseDefiningValueResult(I, true);
+
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+ // The base of this GEP's pointer operand is the base.
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ llvm_unreachable("statepoints don't produce pointers");
+ case Intrinsic::experimental_gc_relocate: {
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ }
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // I have absolutely no idea how to implement this part yet. It's not
+ // necessarily hard, I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (isa<AtomicCmpXchgInst>(I))
+ // A CAS is effectively an atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load.
+ return BaseDefiningValueResult(I, true);
+
+ assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+ "binary ops which don't apply to pointers");
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this is a defining definition of the base just like a load is.
+ if (isa<ExtractValueInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // We should never see an insertvalue since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+ // An extractelement produces a base result exactly when its input does.
+ // We may need to insert a parallel instruction to extract the appropriate
+ // element out of the base vector corresponding to the input. Given this,
+ // it's analogous to the phi and select case even though it's not a merge.
+ if (isa<ExtractElementInst>(I))
+ // Note: There are a lot of obvious peephole cases here. These are deliberately
+ // handled after the main base pointer inference algorithm to make writing
+ // test cases to exercise that code easier.
+ return BaseDefiningValueResult(I, false);
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from among several base
+ // derived pointers (each with its own base potentially). It's the job of
+ // the caller to resolve these.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "missing instruction case in findBaseDefiningValing");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Returns the base defining value for this value, computing it via findBaseDefiningValue and storing it in Cache on a miss.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+ Value *&Cached = Cache[I]; // operator[] creates a null entry on a miss; the reference is filled in below.
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I).BDV;
+ DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
+ }
+ assert(Cache[I] != nullptr);
+ return Cached;
+}
+
+ /// Look up the base defining value (BDV) of I and, when the cache also holds
+ /// an entry keyed by that BDV, return that entry instead. The returned entry
+ /// is either a base-of relation or a self reference; the caller must check
+ /// which. When the BDV has no entry of its own, the BDV itself is returned.
+ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
+   Value *const BDV = findBaseDefiningValueCached(I, Cache);
+   const auto Entry = Cache.find(BDV);
+   return Entry == Cache.end() ? BDV : Entry->second;
+ }
+
+ /// Decide whether a value returned by findBaseDefiningValue or findBaseOrBDV
+ /// is already known to be a base pointer (true), or whether the search for a
+ /// base has to continue through it (false).
+ static bool isKnownBaseResult(Value *V) {
+   // Only these instruction kinds can forward a base from their operands; for
+   // anything else no recursion is possible.
+   const bool MayForwardBase =
+       isa<PHINode>(V) || isa<SelectInst>(V) || isa<ExtractElementInst>(V) ||
+       isa<InsertElementInst>(V) || isa<ShuffleVectorInst>(V);
+   if (!MayForwardBase)
+     return true;
+
+   // A previously inserted base phi/select (or similar) carries the
+   // "is_base_value" metadata and is therefore known to be a base. Without
+   // that marker we need to keep searching.
+   return isa<Instruction>(V) &&
+          cast<Instruction>(V)->getMetadata("is_base_value") != nullptr;
+ }
+
+ namespace {
+ /// Lattice element tracked for a single base defining value (BDV) by the
+ /// findBasePointer algorithm. It records whether nothing is known yet
+ /// (Unknown), a single base has been identified (Base), or a new instruction
+ /// is needed to propagate the base of this BDV (Conflict).
+ class BDVState {
+ public:
+   enum Status { Unknown, Base, Conflict };
+
+   /// Default state: Unknown, with no associated base.
+   BDVState() : status(Unknown), base(nullptr) {}
+
+   /// Build a Base state for the given value.
+   explicit BDVState(Value *BaseValue) : status(Base), base(BaseValue) {}
+
+   /// Build a state with an explicit status. A Base status requires a
+   /// non-null value (checked by assertion).
+   BDVState(Status Kind, Value *BaseValue = nullptr)
+       : status(Kind), base(BaseValue) {
+     assert(status != Base || BaseValue);
+   }
+
+   Status getStatus() const { return status; }
+   Value *getBase() const { return base; }
+
+   bool isUnknown() const { return status == Unknown; }
+   bool isBase() const { return status == Base; }
+   bool isConflict() const { return status == Conflict; }
+
+   /// Two states are equal when both the status and the base agree.
+   bool operator==(const BDVState &other) const {
+     return base == other.base && status == other.status;
+   }
+
+   bool operator!=(const BDVState &other) const {
+     return base != other.base || status != other.status;
+   }
+
+   /// Write a one-letter status tag followed by the base's address and name.
+   void print(raw_ostream &OS) const {
+     const char *Tag = "";
+     switch (status) {
+     case Unknown:
+       Tag = "U";
+       break;
+     case Base:
+       Tag = "B";
+       break;
+     case Conflict:
+       Tag = "C";
+       break;
+     }
+     OS << Tag;
+     OS << " (" << base << " - "
+        << (base ? base->getName() : "nullptr") << "): ";
+   }
+
+   LLVM_DUMP_METHOD
+   void dump() const { print(dbgs()); dbgs() << '\n'; }
+
+ private:
+   Status status;
+   // Null in the Unknown state and set in the Base state. Conflict states are
+   // constructed both without a value and with one elsewhere in this file.
+   AssertingVH<Value> base;
+ };
+ }
+
+ #ifndef NDEBUG
+ /// Stream a BDVState through its print() method. Only defined when NDEBUG
+ /// is not set.
+ static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+   State.print(OS);
+   return OS;
+ }
+ #endif
+
+ namespace {
+ // Values of type BDVState form a lattice; this helper class accumulates the
+ // meet of a sequence of states. The actual lattice rules live in
+ // MeetBDVStates::pureMeet.
+ class MeetBDVStates {
+ public:
+   /// The accumulated result starts out default-constructed (Unknown, the TOP
+   /// state), so meeting it with any other state yields that state.
+   MeetBDVStates() {}
+
+   /// Destructively fold otherState into the accumulated result.
+   void meetWith(BDVState otherState) {
+     currentResult = meet(otherState, currentResult);
+   }
+
+   BDVState getResult() const { return currentResult; }
+
+ private:
+   BDVState currentResult;
+
+   /// Meet two lattice elements, checking (in asserts builds) that the
+   /// operation commutes and logging the outcome under DEBUG.
+   static BDVState meet(BDVState LHS, BDVState RHS) {
+     BDVState Result = pureMeet(LHS, RHS);
+     assert((Result == pureMeet(RHS, LHS)) &&
+            "math is wrong: meet does not commute!");
+     DEBUG(dbgs() << "meet of " << LHS << " with " << RHS
+                  << " produced " << Result << "\n");
+     return Result;
+   }
+
+   /// Side-effect-free meet: Unknown yields the other operand, a Conflict on
+   /// the left is returned unchanged, and two Base states agree only when
+   /// they name the same base; otherwise the result is a Conflict.
+   static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) {
+     const BDVState::Status KindA = stateA.getStatus();
+     if (KindA == BDVState::Unknown)
+       return stateB;
+     if (KindA == BDVState::Conflict)
+       return stateA;
+     if (KindA != BDVState::Base)
+       llvm_unreachable("only three states!");
+
+     // From here on stateA is a Base state.
+     assert(stateA.getBase() && "can't be null");
+     if (stateB.isUnknown())
+       return stateA;
+
+     if (stateB.isBase()) {
+       if (stateA.getBase() != stateB.getBase())
+         return BDVState(BDVState::Conflict);
+       assert(stateA == stateB && "equality broken!");
+       return stateA;
+     }
+
+     assert(stateB.isConflict() && "only three states!");
+     return BDVState(BDVState::Conflict);
+   }
+ };
+ }
+
+
+ /// For a given value or instruction, figure out what base ptr it's derived
+ /// from. For gc objects, this is simply itself. On success, returns a value
+ /// which is the base pointer. (This is reliable and can be used for
+ /// relocation.) NOTE(review): documented as returning nullptr on failure, but no path visible in this function does so -- confirm.
+ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
+ Value *def = findBaseOrBDV(I, cache);
+
+ if (isKnownBaseResult(def)) {
+ return def;
+ }
+
+ // Here's the rough algorithm:
+ // - For every SSA value, construct a mapping to either an actual base
+ // pointer or a PHI which obscures the base pointer.
+ // - Construct a mapping from PHI to unknown TOP state. Use an
+ // optimistic algorithm to propagate base pointer information. Lattice
+ // looks like:
+ // UNKNOWN
+ // b1 b2 b3 b4
+ // CONFLICT
+ // When algorithm terminates, all PHIs will either have a single concrete
+ // base or be in a conflict state.
+ // - For every conflict, insert a dummy PHI node without arguments. Add
+ // these to the base[Instruction] = BasePtr mapping. For every
+ // non-conflict, add the actual base.
+ // - For every conflict, add arguments for the base[a] of each input
+ // arguments.
+ //
+ // Note: A simpler form of this would be to add the conflict form of all
+ // PHIs without running the optimistic algorithm. This would be
+ // analogous to pessimistic data flow and would likely lead to an
+ // overall worse solution.
+
+ #ifndef NDEBUG
+ auto isExpectedBDVType = [](Value *BDV) {
+ return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+ isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV);
+ };
+ #endif
+
+ // Once populated, will contain a mapping from each potentially non-base BDV
+ // to a lattice value (described above) which corresponds to that BDV.
+ // We use the order of insertion (DFS over the def/use graph) to provide a
+ // stable deterministic ordering for visiting DenseMaps (which are unordered)
+ // below. This is important for deterministic compilation.
+ MapVector<Value *, BDVState> States;
+
+ // Recursively fill in all base defining values reachable from the initial
+ // one for which we don't already know a definite base value
+ /* scope */ {
+ SmallVector<Value*, 16> Worklist;
+ Worklist.push_back(def);
+ States.insert(std::make_pair(def, BDVState()));
+ while (!Worklist.empty()) {
+ Value *Current = Worklist.pop_back_val();
+ assert(!isKnownBaseResult(Current) && "why did it get added?");
+
+ auto visitIncomingValue = [&](Value *InVal) {
+ Value *Base = findBaseOrBDV(InVal, cache);
+ if (isKnownBaseResult(Base))
+ // Known bases won't need new instructions introduced and can be
+ // ignored safely
+ return;
+ assert(isExpectedBDVType(Base) && "the only non-base values "
+ "we see should be base defining values");
+ if (States.insert(std::make_pair(Base, BDVState())).second)
+ Worklist.push_back(Base);
+ };
+ if (PHINode *Phi = dyn_cast<PHINode>(Current)) {
+ for (Value *InVal : Phi->incoming_values())
+ visitIncomingValue(InVal);
+ } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) {
+ visitIncomingValue(Sel->getTrueValue());
+ visitIncomingValue(Sel->getFalseValue());
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+ visitIncomingValue(EE->getVectorOperand());
+ } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+ visitIncomingValue(IE->getOperand(0)); // vector operand
+ visitIncomingValue(IE->getOperand(1)); // scalar operand
+ } else {
+ // There is one known class of instructions we know we don't handle.
+ assert(isa<ShuffleVectorInst>(Current));
+ llvm_unreachable("unimplemented instruction case");
+ }
+ }
+ }
+
+ #ifndef NDEBUG
+ DEBUG(dbgs() << "States after initialization:\n");
+ for (auto Pair : States) {
+ DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+ #endif
+
+ // Return a phi state for a base defining value. We'll generate a new
+ // base state for known bases and expect to find a cached state otherwise.
+ auto getStateForBDV = [&](Value *baseValue) {
+ if (isKnownBaseResult(baseValue))
+ return BDVState(baseValue);
+ auto I = States.find(baseValue);
+ assert(I != States.end() && "lookup failed!");
+ return I->second;
+ };
+
+ bool progress = true;
+ while (progress) {
+ #ifndef NDEBUG
+ const size_t oldSize = States.size();
+ #endif
+ progress = false;
+ // We're only changing values in this loop, thus safe to keep iterators.
+ // Since this is computing a fixed point, the order of visit does not
+ // affect the result. TODO: We could use a worklist here and make this run
+ // much faster.
+ for (auto Pair : States) {
+ Value *BDV = Pair.first;
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+ // Given an input value for the current instruction, return a BDVState
+ // instance which represents the BDV of that value.
+ auto getStateForInput = [&](Value *V) mutable {
+ Value *BDV = findBaseOrBDV(V, cache);
+ return getStateForBDV(BDV);
+ };
+
+ MeetBDVStates calculateMeet;
+ if (SelectInst *select = dyn_cast<SelectInst>(BDV)) {
+ calculateMeet.meetWith(getStateForInput(select->getTrueValue()));
+ calculateMeet.meetWith(getStateForInput(select->getFalseValue()));
+ } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) {
+ for (Value *Val : Phi->incoming_values())
+ calculateMeet.meetWith(getStateForInput(Val));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+ // The 'meet' for an extractelement is slightly trivial, but it's still
+ // useful in that it drives us to conflict if our input is.
+ calculateMeet.meetWith(getStateForInput(EE->getVectorOperand()));
+ } else {
+ // Given there's an inherent type mismatch between the operands, will
+ // *always* produce Conflict.
+ auto *IE = cast<InsertElementInst>(BDV);
+ calculateMeet.meetWith(getStateForInput(IE->getOperand(0)));
+ calculateMeet.meetWith(getStateForInput(IE->getOperand(1)));
+ }
+
+ BDVState oldState = States[BDV];
+ BDVState newState = calculateMeet.getResult();
+ if (oldState != newState) {
+ progress = true;
+ States[BDV] = newState;
+ }
+ }
+
+ assert(oldSize == States.size() &&
+ "fixed point shouldn't be adding any new nodes to state");
+ }
+
+ #ifndef NDEBUG
+ DEBUG(dbgs() << "States after meet iteration:\n");
+ for (auto Pair : States) {
+ DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+ #endif
+
+ // Insert Phis for all conflicts
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ assert(!isKnownBaseResult(I) && "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ // extractelement instructions are a bit special in that we may need to
+ // insert an extract even when we know an exact base for the instruction.
+ // The problem is that we need to convert from a vector base to a scalar
+ // base for the particular index we're interested in.
+ if (State.isBase() && isa<ExtractElementInst>(I) &&
+ isa<VectorType>(State.getBase()->getType())) {
+ auto *EE = cast<ExtractElementInst>(I);
+ // TODO: In many cases, the new instruction is just EE itself. We should
+ // exploit this, but can't do it here since it would break the invariant
+ // about the BDV not being known to be a base.
+ auto *BaseInst = ExtractElementInst::Create(State.getBase(),
+ EE->getIndexOperand(),
+ "base_ee", EE);
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Base, BaseInst);
+ }
+
+ // Since we're joining a vector and scalar base, they can never be the
+ // same. As a result, we should always see insert element having reached
+ // the conflict state.
+ if (isa<InsertElementInst>(I)) {
+ assert(State.isConflict());
+ }
+
+ if (!State.isConflict())
+ continue;
+
+ /// Create and insert a new instruction which will represent the base of
+ /// the given instruction 'I'.
+ auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+ if (isa<PHINode>(I)) {
+ BasicBlock *BB = I->getParent();
+ int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+ assert(NumPreds > 0 && "how did we reach here");
+ std::string Name = suffixed_name_or(I, ".base", "base_phi");
+ return PHINode::Create(I->getType(), NumPreds, Name, I);
+ } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) {
+ // The undef will be replaced later
+ UndefValue *Undef = UndefValue::get(Sel->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_select");
+ return SelectInst::Create(Sel->getCondition(), Undef,
+ Undef, Name, Sel);
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ee");
+ return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+ EE);
+ } else {
+ auto *IE = cast<InsertElementInst>(I);
+ UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+ UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ie");
+ return InsertElementInst::Create(VecUndef, ScalarUndef,
+ IE->getOperand(2), Name, IE);
+ }
+
+ };
+ Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+ // Add metadata marking this as a base value
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Conflict, BaseInst);
+ }
+
+ // Returns an instruction which produces the base pointer for a given
+ // instruction. The instruction is assumed to be an input to one of the BDVs
+ // seen in the inference algorithm above. As such, we must either already
+ // know its base defining value is a base, or have inserted a new
+ // instruction to propagate the base of its BDV and have entered that newly
+ // introduced instruction into the state table. In either case, we are
+ // assured to be able to determine an instruction which produces its base
+ // pointer.
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
+ Value *BDV = findBaseOrBDV(Input, cache);
+ Value *Base = nullptr;
+ if (isKnownBaseResult(BDV)) {
+ Base = BDV;
+ } else {
+ // Either conflict or base.
+ assert(States.count(BDV));
+ Base = States[BDV].getBase();
+ }
+ assert(Base && "can't be null");
+ // The cast is needed since base traversal may strip away bitcasts. With a null InsertPt no cast is created and Base is returned as found.
+ if (Base->getType() != Input->getType() &&
+ InsertPt) {
+ Base = new BitCastInst(Base, Input->getType(), "cast",
+ InsertPt);
+ }
+ return Base;
+ };
+
+ // Fixup all the inputs of the new PHIs. Visit order needs to be
+ // deterministic and predictable because we're naming newly created
+ // instructions.
+ for (auto Pair : States) {
+ Instruction *BDV = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (!State.isConflict())
+ continue;
+
+ if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) {
+ PHINode *phi = cast<PHINode>(BDV);
+ unsigned NumPHIValues = phi->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = phi->getIncomingValue(i);
+ BasicBlock *InBB = phi->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
+
+ int blockIndex = basephi->getBasicBlockIndex(InBB);
+ if (blockIndex != -1) {
+ Value *oldBase = basephi->getIncomingValue(blockIndex);
+ basephi->addIncoming(oldBase, InBB);
+
+ #ifndef NDEBUG
+ Value *Base = getBaseForInput(InVal, nullptr);
+ // In essence this assert states: the only way two
+ // values incoming from the same basic block may be
+ // different is by being different bitcasts of the same
+ // value. A cleanup that remains TODO is changing
+ // findBaseOrBDV to return an llvm::Value of the correct
+ // type (and still remain pure). This will remove the
+ // need to add bitcasts.
+ assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+ "sanity -- findBaseOrBDV should be pure!");
+ #endif
+ continue;
+ }
+
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast in the incoming block.
+ // TODO: Need to split critical edges if insertion is needed
+ Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+ basephi->addIncoming(Base, InBB);
+ }
+ assert(basephi->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) {
+ SelectInst *Sel = cast<SelectInst>(BDV);
+ // Operand 1 & 2 are true, false path respectively. TODO: refactor to
+ // something more safe and less hacky.
+ for (int i = 1; i <= 2; i++) {
+ Value *InVal = Sel->getOperand(i);
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast.
+ Value *Base = getBaseForInput(InVal, BaseSel);
+ BaseSel->setOperand(i, Base);
+ }
+ } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) {
+ Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast.
+ Value *Base = getBaseForInput(InVal, BaseEE);
+ BaseEE->setOperand(0, Base);
+ } else {
+ auto *BaseIE = cast<InsertElementInst>(State.getBase());
+ auto *BdvIE = cast<InsertElementInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvIE->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseIE);
+ BaseIE->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // scalar operand
+ }
+
+ }
+
+ // Now that we're done with the algorithm, see if we can optimize the
+ // results slightly by reducing the number of new instructions needed.
+ // Arguably, this should be integrated into the algorithm above, but
+ // doing as a post process step is easier to reason about for the moment.
+ DenseMap<Value *, Value *> ReverseMap;
+ SmallPtrSet<Instruction *, 16> NewInsts;
+ SmallSetVector<AssertingVH<Instruction>, 16> Worklist;
+ // Note: We need to visit the states in a deterministic order. We iterate
+ // the States MapVector declared above for this purpose. NOTE(review): the original
+ // remark here claimed the algorithm above has a non-deterministic visit order, which
+ // disagrees with the insertion-order comment on States -- confirm which one is current.
+ for (auto Pair : States) {
+ auto *BDV = Pair.first;
+ auto State = Pair.second;
+ Value *Base = State.getBase();
+ assert(BDV && Base);
+ assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ assert(isKnownBaseResult(Base) &&
+ "must be something we 'know' is a base pointer");
+ if (!State.isConflict())
+ continue;
+
+ ReverseMap[Base] = BDV;
+ if (auto *BaseI = dyn_cast<Instruction>(Base)) {
+ NewInsts.insert(BaseI);
+ Worklist.insert(BaseI);
+ }
+ }
+ auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI,
+ Value *Replacement) {
+ // Add users which are new instructions (excluding self references)
+ for (User *U : BaseI->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ if (NewInsts.count(UI) && UI != BaseI)
+ Worklist.insert(UI);
+ // Then do the actual replacement
+ NewInsts.erase(BaseI);
+ ReverseMap.erase(BaseI);
+ BaseI->replaceAllUsesWith(Replacement);
+ assert(States.count(BDV));
+ assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI);
+ States[BDV] = BDVState(BDVState::Conflict, Replacement);
+ BaseI->eraseFromParent();
+ };
+ const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout();
+ while (!Worklist.empty()) {
+ Instruction *BaseI = Worklist.pop_back_val();
+ assert(NewInsts.count(BaseI));
+ Value *Bdv = ReverseMap[BaseI];
+ if (auto *BdvI = dyn_cast<Instruction>(Bdv))
+ if (BaseI->isIdenticalTo(BdvI)) {
+ DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n");
+ ReplaceBaseInstWith(Bdv, BaseI, Bdv);
+ continue;
+ }
+ if (Value *V = SimplifyInstruction(BaseI, DL)) {
+ DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n");
+ ReplaceBaseInstWith(Bdv, BaseI, V);
+ continue;
+ }
+ }
+
+ // Cache all of our results so we can cheaply reuse them
+ // NOTE: This is actually two caches: one of the base defining value
+ // relation and one of the base pointer relation! FIXME
+ for (auto Pair : States) {
+ auto *BDV = Pair.first;
+ Value *base = Pair.second.getBase();
+ assert(BDV && base);
+
+ std::string fromstr = cache.count(BDV) ? cache[BDV]->getName() : "none";
+ DEBUG(dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName()
+ << " from: " << fromstr
+ << " to: " << base->getName() << "\n");
+
+ if (cache.count(BDV)) {
+ // Once we transition from the BDV relation being stored in the cache to
+ // the base relation being stored, it must be stable
+ assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) &&
+ "base relation should be stable");
+ }
+ cache[BDV] = base;
+ }
+ assert(cache.count(def));
+ return cache[def];
+ }
+
+ // Compute the base pointer for every pointer in a live set (which may hold
+ // base and/or derived pointers). The IR may be mutated so that the base is
+ // available wherever the derived pointer is; this can insert additional PHI
+ // nodes.
+ //
+ // preconditions: live is a set of pointer type Values
+ //
+ // side effects: may insert PHI nodes into the existing CFG, will preserve
+ // CFG, will not remove or mutate any existing nodes
+ //
+ // post condition: PointerToBase contains one (derived, base) pair for every
+ // pointer in live. A pointer that is itself a base maps to itself.
+ static void
+ findBasePointers(const StatepointLiveSetTy &live,
+                  DenseMap<Value *, Value *> &PointerToBase,
+                  DominatorTree *DT, DefiningValueMapTy &DVCache) {
+   // Names of inserted values depend on visit order, and DenseSets do not
+   // iterate in a deterministic order across runs. Sorting a copy of the live
+   // set keeps the output (and the tests checking it) stable.
+   SmallVector<Value *, 64> Ordered(live.begin(), live.end());
+   std::sort(Ordered.begin(), Ordered.end(), order_by_name);
+
+   for (Value *Derived : Ordered) {
+     Value *Base = findBasePointer(Derived, DVCache);
+     assert(Base && "failed to find base pointer");
+     PointerToBase[Derived] = Base;
+     assert((!isa<Instruction>(Base) || !isa<Instruction>(Derived) ||
+             DT->dominates(cast<Instruction>(Base)->getParent(),
+                           cast<Instruction>(Derived)->getParent())) &&
+            "The base we found better dominate the derived pointer");
+
+     // If you see this trip and like to live really dangerously, the code
+     // should be correct, just with idioms the verifier can't handle. You can
+     // try disabling the verifier at your own substantial risk.
+     assert(!isa<ConstantPointerNull>(Base) &&
+            "the relocation code needs adjustment to handle the relocation of "
+            "a null pointer constant without causing false positives in the "
+            "safepoint ir verifier.");
+   }
+ }
+
+ /// Compute the base pointers for the live set recorded in `result` and store
+ /// the resulting derived-to-base map back into `result.PointerToBase`. When
+ /// PrintBasePointers is set, the pairs are also printed to errs().
+ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+                              const CallSite &CS,
+                              PartiallyConstructedSafepointRecord &result) {
+   DenseMap<Value *, Value *> PointerToBase;
+   findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
+
+   if (PrintBasePointers) {
+     // This output is checked by some tests, so it must be emitted in a
+     // stable order: collect the keys and sort them by name first.
+     errs() << "Base Pairs (w/o Relocation):\n";
+     SmallVector<Value *, 64> Keys;
+     Keys.reserve(PointerToBase.size());
+     for (const auto &Entry : PointerToBase)
+       Keys.push_back(Entry.first);
+     std::sort(Keys.begin(), Keys.end(), order_by_name);
+
+     for (Value *Derived : Keys) {
+       errs() << " derived ";
+       Derived->printAsOperand(errs(), false);
+       errs() << " base ";
+       PointerToBase[Derived]->printAsOperand(errs(), false);
+       errs() << "\n";
+     }
+   }
+
+   result.PointerToBase = PointerToBase;
+ }
+
+ /// Given an updated version of the dataflow liveness results, update the
+ /// liveset and base pointer maps for the call site CS. (Declared here; the
+ /// definition is not part of this section.)
+ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+                                   const CallSite &CS,
+                                   PartiallyConstructedSafepointRecord &result);
+
+ /// Recompute liveness for the whole function F once, then refresh each
+ /// safepoint record from it. `toUpdate[i]` is the call site that belongs to
+ /// `records[i]`.
+ static void recomputeLiveInValues(
+     Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+     MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+   // TODO-PERF: reuse the original liveness, then simply run the dataflow
+   // again. The old values are still live and will help it stabilize quickly.
+   GCPtrLivenessData RevisedLivenessData;
+   computeLiveInValues(DT, F, RevisedLivenessData);
+
+   for (size_t Idx = 0, End = records.size(); Idx != End; ++Idx)
+     recomputeLiveInValues(RevisedLivenessData, toUpdate[Idx], records[Idx]);
+ }
+
+ // gc.relocate and gc.result calls must be placed so that nothing uses the
+ // original value / return value between the gc.statepoint and them. A phi
+ // node at the start of a successor block is one way such a use can arise.
+ // The gc.relocates must also sit only on the path that goes through the
+ // statepoint, which may require splitting an edge. This helper returns a
+ // block (BB itself, or a newly split predecessor block) that has a unique
+ // predecessor and no phi nodes.
+ static BasicBlock *
+ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
+                             DominatorTree &DT) {
+   // Reuse BB when it already has a unique predecessor; otherwise split the
+   // edge coming from InvokeParent.
+   BasicBlock *Target = BB->getUniquePredecessor()
+                            ? BB
+                            : SplitBlockPredecessors(BB, InvokeParent, "", &DT);
+
+   // With a unique predecessor, every phi in Target has a single entry and
+   // can be folded away.
+   FoldSingleEntryPHINodes(Target);
+   assert(!isa<PHINode>(Target->begin()) &&
+          "All PHI nodes should have been removed!");
+
+   // A gc.relocate or gc.result can now be inserted as the first instruction
+   // of Target if needed.
+   return Target;
+ }
+
+ // Build the attribute set that may be carried over from the original call to
+ // the safepoint. Only return-value and function attributes are considered;
+ // parameter attributes are skipped for now.
+ static AttributeSet legalizeCallAttributes(AttributeSet AS) {
+   AttributeSet Legal;
+
+   const unsigned NumSlots = AS.getNumSlots();
+   for (unsigned Slot = 0; Slot != NumSlots; ++Slot) {
+     const unsigned Index = AS.getSlotIndex(Slot);
+
+     // Just skip parameter attributes for now.
+     if (Index != AttributeSet::ReturnIndex &&
+         Index != AttributeSet::FunctionIndex)
+       continue;
+
+     for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
+       // A safepoint can not be read only or read none, so those attributes
+       // are dropped. The "statepoint-*" string attributes only control how
+       // the gc.statepoint call / invoke itself is generated and are of no
+       // use once it is in place.
+       const bool Dropped = Attr.hasAttribute(Attribute::ReadNone) ||
+                            Attr.hasAttribute(Attribute::ReadOnly) ||
+                            Attr.hasAttribute("statepoint-num-patch-bytes") ||
+                            Attr.hasAttribute("statepoint-id");
+       if (Dropped)
+         continue;
+
+       Legal = Legal.addAttributes(
+           AS.getContext(), Index,
+           AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
+     }
+   }
+
+   return Legal;
+ }
+
+ /// Emit one gc.relocate call per live variable of the given statepoint.
+ /// Inputs:
+ /// LiveVariables - list of variables to be relocated.
+ /// LiveStart - index of the first live variable.
+ /// BasePtrs - base pointers; BasePtrs[i] is the base of LiveVariables[i]
+ /// and must itself appear in LiveVariables.
+ /// StatepointToken - statepoint instruction to which relocates should be
+ /// bound.
+ /// Builder - LLVM IR builder used to construct the new calls.
+ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
+                               const int LiveStart,
+                               ArrayRef<Value *> BasePtrs,
+                               Instruction *StatepointToken,
+                               IRBuilder<> Builder) {
+   if (LiveVariables.empty())
+     return;
+
+   // Position of Val inside LiveVec; Val is required to be present.
+   auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
+     auto Pos = std::find(LiveVec.begin(), LiveVec.end(), Val);
+     assert(Pos != LiveVec.end() && "Val not found in LiveVec!");
+     size_t Index = std::distance(LiveVec.begin(), Pos);
+     assert(Index < LiveVec.size() && "Bug in std::find?");
+     return Index;
+   };
+   Module *M = StatepointToken->getModule();
+
+   // Every gc_relocate is declared as returning i8 addrspace(N)* (or a vector
+   // of that type). Unique declarations per pointer type were tried
+   // originally, but proved problematic because the intrinsic mangling code
+   // is incomplete and fragile. Since a single unified pointer type is the
+   // direction anyway, everything is cast to an i8* of the right address
+   // space; a bitcast added later converts the gc_relocate result back to the
+   // actual value's type.
+   auto getGCRelocateDecl = [&] (Type *Ty) {
+     assert(isHandledGCPointerType(Ty));
+     auto AS = Ty->getScalarType()->getPointerAddressSpace();
+     Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
+     if (auto *VT = dyn_cast<VectorType>(Ty))
+       NewTy = VectorType::get(NewTy, VT->getNumElements());
+     return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
+                                      {NewTy});
+   };
+
+   // Lazily filled map from an input type to the canonical declaration
+   // described above. This should probably be cached somewhere more broadly.
+   DenseMap<Type*, Value*> TypeToDeclMap;
+
+   for (unsigned i = 0, e = LiveVariables.size(); i != e; ++i) {
+     Value *Live = LiveVariables[i];
+
+     // Operand indices of the base and of the derived pointer.
+     Value *BaseIdx =
+         Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i]));
+     Value *LiveIdx = Builder.getInt32(LiveStart + i);
+
+     // Fetch the declaration for this type, creating it on first use.
+     Type *Ty = Live->getType();
+     Value *&GCRelocateDecl = TypeToDeclMap[Ty];
+     if (!GCRelocateDecl)
+       GCRelocateDecl = getGCRelocateDecl(Ty);
+
+     // Only specify a debug name if we can give a useful one.
+     CallInst *Reloc = Builder.CreateCall(
+         GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+         suffixed_name_or(Live, ".relocated", ""));
+     // Trick CodeGen into thinking there are lots of free registers at this
+     // fake call.
+     Reloc->setCallingConv(CallingConv::Cold);
+   }
+ }
+
+ namespace {
+
+ /// Records a pending "replace all uses of Old with New, then erase Old"
+ /// action so that RAUWs and `eraseFromParent` calls can be deferred. Using
+ /// this avoids having to worry about keeping around dangling pointers to
+ /// Values. New may be null, in which case Old is only erased.
+ class DeferredReplacement {
+   AssertingVH<Instruction> Old;
+   AssertingVH<Instruction> New;
+
+ public:
+   explicit DeferredReplacement(Instruction *Old, Instruction *New) :
+     Old(Old), New(New) {
+     assert(Old != New && "Not allowed!");
+   }
+
+   /// Does the task represented by this instance.
+   void doReplacement() {
+     Instruction *const Victim = Old;
+     Instruction *const Substitute = New;
+
+     assert(Victim != Substitute && "Disallowed at construction?!");
+
+     // Release both handles before touching the IR -- presumably so that the
+     // asserting handles do not fire when the old instruction is erased.
+     Old = nullptr;
+     New = nullptr;
+
+     if (Substitute)
+       Victim->replaceAllUsesWith(Substitute);
+     Victim->eraseFromParent();
+   }
+ };
+ }
+
+/// Replace the call or invoke \p CS with an explicit gc.statepoint and emit a
+/// gc.relocate for every entry of \p LiveVariables.
+///
+/// \param CS The call site being rewritten.
+/// \param BasePtrs Base pointer for each live variable; must have the same
+/// length as \p LiveVariables (asserted below).
+/// \param LiveVariables Values passed as the gc arguments of the statepoint.
+/// \param Result Receives the new statepoint in StatepointToken and, for
+/// invokes, the unwind block's landing pad in UnwindToken.
+/// \param Replacements When UseDeoptBundles is set, the RAUW / erase of the
+/// old instruction is queued here instead of being performed immediately.
+static void
+makeStatepointExplicitImpl(const CallSite CS, /* to replace */
+ const SmallVectorImpl<Value *> &BasePtrs,
+ const SmallVectorImpl<Value *> &LiveVariables,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ assert(BasePtrs.size() == LiveVariables.size());
+ assert((UseDeoptBundles || isStatepoint(CS)) &&
+ "This method expects to be rewriting a statepoint");
+
+ // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ Instruction *InsertBefore = CS.getInstruction();
+ IRBuilder<> Builder(InsertBefore);
+
+ ArrayRef<Value *> GCArgs(LiveVariables);
+ // Defaults; the UseDeoptBundles branch only overrides the ID and the patch
+ // byte count when the matching string attribute is present, while the legacy
+ // branch always overwrites all three from the old statepoint.
+ uint64_t StatepointID = 0xABCDEF00;
+ uint32_t NumPatchBytes = 0;
+ uint32_t Flags = uint32_t(StatepointFlags::None);
+
+ ArrayRef<Use> CallArgs;
+ ArrayRef<Use> DeoptArgs;
+ ArrayRef<Use> TransitionArgs;
+
+ Value *CallTarget = nullptr;
+
+ if (UseDeoptBundles) {
+ CallArgs = {CS.arg_begin(), CS.arg_end()};
+ DeoptArgs = GetDeoptBundleOperands(CS);
+ // TODO: we don't fill in TransitionArgs or Flags in this branch, but we
+ // could have an operand bundle for that too.
+ AttributeSet OriginalAttrs = CS.getAttributes();
+
+ // NOTE(review): the result of getAsInteger is not checked in either of the
+ // two parses below -- confirm what should happen on a malformed value.
+ Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "statepoint-id");
+ if (AttrID.isStringAttribute())
+ AttrID.getValueAsString().getAsInteger(10, StatepointID);
+
+ Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
+ AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
+ if (AttrNumPatchBytes.isStringAttribute())
+ AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
+
+ CallTarget = CS.getCalledValue();
+ } else {
+ // This branch will be gone soon, and we will soon only support the
+ // UseDeoptBundles == true configuration.
+ Statepoint OldSP(CS);
+ StatepointID = OldSP.getID();
+ NumPatchBytes = OldSP.getNumPatchBytes();
+ Flags = OldSP.getFlags();
+
+ CallArgs = {OldSP.arg_begin(), OldSP.arg_end()};
+ DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()};
+ TransitionArgs = {OldSP.gc_transition_args_begin(),
+ OldSP.gc_transition_args_end()};
+ CallTarget = OldSP.getCalledValue();
+ }
+
+ // Create the statepoint given all the arguments
+ Instruction *Token = nullptr;
+ // NOTE(review): ReturnAttrs is assigned in both branches below but is never
+ // read in this function.
+ AttributeSet ReturnAttrs;
+ if (CS.isCall()) {
+ CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *Call = Builder.CreateGCStatepointCall(
+ StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+ TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+ Call->setTailCall(ToReplace->isTailCall());
+ Call->setCallingConv(ToReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+ // In case if we can handle this set of attributes - set up function attrs
+ // directly on statepoint and return attrs later for gc_result intrinsic.
+ Call->setAttributes(NewAttrs.getFnAttributes());
+ ReturnAttrs = NewAttrs.getRetAttributes();
+
+ Token = Call;
+
+ // Put the following gc_result and gc_relocate calls immediately after
+ // the old call (which we're about to delete)
+ assert(ToReplace->getNextNode() && "Not a terminator, must have next!");
+ Builder.SetInsertPoint(ToReplace->getNextNode());
+ Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+ } else {
+ InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+ StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(),
+ ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs,
+ GCArgs, "statepoint_token");
+
+ Invoke->setCallingConv(ToReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+ // In case if we can handle this set of attributes - set up function attrs
+ // directly on statepoint and return attrs later for gc_result intrinsic.
+ Invoke->setAttributes(NewAttrs.getFnAttributes());
+ ReturnAttrs = NewAttrs.getRetAttributes();
+
+ Token = Invoke;
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *UnwindBlock = ToReplace->getUnwindDest();
+ assert(!isa<PHINode>(UnwindBlock->begin()) &&
+ UnwindBlock->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+ // Attach exceptional gc relocates to the landingpad.
+ Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+ Result.UnwindToken = ExceptionalToken;
+
+ const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken,
+ Builder);
+
+ // Generate gc relocates and returns for normal block
+ BasicBlock *NormalDest = ToReplace->getNormalDest();
+ assert(!isa<PHINode>(NormalDest->begin()) &&
+ NormalDest->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ // Leave the builder at the top of the normal destination so that the code
+ // after this if/else emits into the normal path.
+ Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
+
+ // gc relocates will be generated later as if it were regular call
+ // statepoint
+ }
+ assert(Token && "Should be set in one of the above branches!");
+
+ if (UseDeoptBundles) {
+ Token->setName("statepoint_token");
+ // A gc.result is only materialized when the original call produces a value
+ // that is actually used.
+ if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+ StringRef Name =
+ CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+ CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
+ GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+
+ // We cannot RAUW or delete CS.getInstruction() because it could be in the
+ // live set of some other safepoint, in which case that safepoint's
+ // PartiallyConstructedSafepointRecord will hold a raw pointer to this
+ // llvm::Instruction. Instead, we defer the replacement and deletion to
+ // after the live sets have been made explicit in the IR, and we no longer
+ // have raw pointers to worry about.
+ Replacements.emplace_back(CS.getInstruction(), GCResult);
+ } else {
+ // Nothing to replace the uses with; only the erase is deferred.
+ Replacements.emplace_back(CS.getInstruction(), nullptr);
+ }
+ } else {
+ assert(!CS.getInstruction()->hasNUsesOrMore(2) &&
+ "only valid use before rewrite is gc.result");
+ assert(!CS.getInstruction()->hasOneUse() ||
+ isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin())));
+
+ // Take the name of the original statepoint token if there was one.
+ Token->takeName(CS.getInstruction());
+
+ // Update the gc.result of the original statepoint (if any) to use the newly
+ // inserted statepoint. This is safe to do here since the token can't be
+ // considered a live reference.
+ CS.getInstruction()->replaceAllUsesWith(Token);
+ CS.getInstruction()->eraseFromParent();
+ }
+
+ Result.StatepointToken = Token;
+
+ // Second, create a gc.relocate for every live variable
+ const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
+}
+
+namespace {
+/// A (base, derived) pointer pair that doubles as the comparator used to sort
+/// such pairs by the name of the derived value.
+struct NameOrdering {
+  Value *Base;
+  Value *Derived;
+
+  /// Strict weak ordering: \p a precedes \p b when the name of a.Derived
+  /// compares less than the name of b.Derived.
+  ///
+  /// The test is "< 0" rather than "== -1" so that it does not depend on the
+  /// exact negative value returned by compare(), and the call operator is
+  /// const so the functor can be invoked through a const object.
+  bool operator()(NameOrdering const &a, NameOrdering const &b) const {
+    return a.Derived->getName().compare(b.Derived->getName()) < 0;
+  }
+};
+}
+
+/// Reorder \p BaseVec and \p LiveVec in lock step so that the entries are
+/// sorted by the name of the live (derived) value. The two vectors must have
+/// the same length; element i of one always stays paired with element i of
+/// the other.
+static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec,
+                           SmallVectorImpl<Value *> &LiveVec) {
+  assert(BaseVec.size() == LiveVec.size());
+
+  // Zip the two vectors into one sequence of pairs.
+  SmallVector<NameOrdering, 64> Pairs;
+  for (size_t Idx = 0, End = BaseVec.size(); Idx != End; ++Idx) {
+    NameOrdering Entry;
+    Entry.Base = BaseVec[Idx];
+    Entry.Derived = LiveVec[Idx];
+    Pairs.push_back(Entry);
+  }
+
+  std::sort(Pairs.begin(), Pairs.end(), NameOrdering());
+
+  // Unzip the sorted pairs back into the caller's vectors.
+  size_t Out = 0;
+  for (const NameOrdering &Entry : Pairs) {
+    BaseVec[Out] = Entry.Base;
+    LiveVec[Out] = Entry.Derived;
+    ++Out;
+  }
+}
+
+// Rewrite an existing gc.statepoint into a new one followed by the set of
+// gc.relocates that spell out the relocations happening at this safepoint.
+// The live set and base mapping are read from "Result".
+//
+// WARNING: Users of the original live values are not adjusted here; that is
+// the caller's responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, const CallSite &CS,
+                       PartiallyConstructedSafepointRecord &Result,
+                       std::vector<DeferredReplacement> &Replacements) {
+  const auto &Live = Result.LiveSet;
+  const auto &BaseOf = Result.PointerToBase;
+
+  // Flatten the set into two parallel vectors (live value / its base) so the
+  // entries can be cross referenced by index.
+  SmallVector<Value *, 64> BaseVec, LiveVec;
+  const auto NumLive = Live.size();
+  LiveVec.reserve(NumLive);
+  BaseVec.reserve(NumLive);
+  for (Value *Ptr : Live) {
+    assert(BaseOf.count(Ptr));
+    LiveVec.push_back(Ptr);
+    BaseVec.push_back(BaseOf.find(Ptr)->second);
+  }
+  assert(LiveVec.size() == BaseVec.size());
+
+  // Sort by value name so the emitted IR is a little more stable (useful for
+  // diffs). The order has no other meaning.
+  StabilizeOrder(BaseVec, LiveVec);
+
+  // Perform the rewrite itself and delete the old statepoint.
+  makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);
+}
+
+// Helper for relocationViaAlloca.
+//
+// Walks the users in "GCRelocs" and, for every gc.relocate among them, emits a
+// store of the relocated value into the alloca that "AllocaMap" assigns to
+// the original (derived) pointer. Users that are not gc.relocates are
+// ignored. In builds with assertions enabled the original values are also
+// recorded in "VisitedLiveValues" for later consistency checking.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+                       DenseMap<Value *, Value *> &AllocaMap,
+                       DenseSet<Value *> &VisitedLiveValues) {
+  for (User *Candidate : GCRelocs) {
+    auto *Reloc = dyn_cast<GCRelocateInst>(Candidate);
+    if (Reloc == nullptr)
+      continue;
+
+    Value *Original = const_cast<Value *>(Reloc->getDerivedPtr());
+    assert(AllocaMap.count(Original));
+    Value *Slot = AllocaMap[Original];
+
+    // The relocate has to be cast to the type held by the alloca before it
+    // can be stored; the cast goes right after the relocate.
+    auto *After = Reloc->getNextNode();
+    assert(After &&
+           "Should always have one since it's not a terminator");
+    IRBuilder<> Builder(After);
+    auto *SlotTy = cast<AllocaInst>(Slot)->getAllocatedType();
+    Value *Casted = Builder.CreateBitCast(
+        Reloc, SlotTy, suffixed_name_or(Reloc, ".casted", ""));
+
+    // Store the casted value into its slot, directly after the cast.
+    StoreInst *Store = new StoreInst(Casted, Slot);
+    Store->insertAfter(cast<Instruction>(Casted));
+
+#ifndef NDEBUG
+    VisitedLiveValues.insert(Original);
+#endif
+  }
+}
+
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+//
+// For every (rematerialized instruction, original value) entry of
+// "RematerializedValues", a store of the rematerialized instruction into the
+// alloca assigned to the original value is inserted right after that
+// instruction. In builds with assertions enabled the original value is also
+// recorded in "VisitedLiveValues".
+//
+// The map is only read, so it is taken by const reference and iterated by
+// const reference; this avoids copying the whole map on every call and every
+// entry on every iteration.
+static void
+insertRematerializationStores(
+    const RematerializedValueMapTy &RematerializedValues,
+    DenseMap<Value *, Value *> &AllocaMap,
+    DenseSet<Value *> &VisitedLiveValues) {
+
+  for (const auto &RematerializedValuePair : RematerializedValues) {
+    Instruction *RematerializedValue = RematerializedValuePair.first;
+    Value *OriginalValue = RematerializedValuePair.second;
+
+    assert(AllocaMap.count(OriginalValue) &&
+           "Can not find alloca for rematerialized value");
+    Value *Alloca = AllocaMap[OriginalValue];
+
+    StoreInst *Store = new StoreInst(RematerializedValue, Alloca);
+    Store->insertAfter(RematerializedValue);
+
+#ifndef NDEBUG
+    VisitedLiveValues.insert(OriginalValue);
+#endif
+  }
+}
+
+/// Do all the relocation update via allocas and mem2reg
+///
+/// One alloca is created in the entry block for every value in \p Live and
+/// for every original value that has a rematerialized copy in \p Records.
+/// Stores are inserted after each definition / relocation / rematerialization
+/// and loads before each use, and finally the allocas are handed to
+/// PromoteMemToReg.
+///
+/// \param F Function being rewritten.
+/// \param DT Dominator tree passed through to PromoteMemToReg.
+/// \param Live The live gc pointers that need a slot.
+/// \param Records Per-statepoint records providing the statepoint token, the
+/// unwind token (for invokes) and the rematerialized values.
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+ ArrayRef<PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+ // record initial number of (static) allocas; we'll check we have the same
+ // number when we get done.
+ int InitialAllocaNum = 0;
+ for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E;
+ I++)
+ if (isa<AllocaInst>(*I))
+ InitialAllocaNum++;
+#endif
+
+ // TODO-PERF: change data structures, reserve
+ DenseMap<Value *, Value *> AllocaMap;
+ SmallVector<AllocaInst *, 200> PromotableAllocas;
+ // Used later to check that we have enough allocas to store all values
+ std::size_t NumRematerializedValues = 0;
+ PromotableAllocas.reserve(Live.size());
+
+ // Emit alloca for "LiveValue" and record it in "allocaMap" and
+ // "PromotableAllocas"
+ auto emitAllocaFor = [&](Value *LiveValue) {
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+ F.getEntryBlock().getFirstNonPHI());
+ AllocaMap[LiveValue] = Alloca;
+ PromotableAllocas.push_back(Alloca);
+ };
+
+ // Emit alloca for each live gc pointer
+ for (Value *V : Live)
+ emitAllocaFor(V);
+
+ // Emit allocas for rematerialized values
+ for (const auto &Info : Records)
+ for (auto RematerializedValuePair : Info.RematerializedValues) {
+ Value *OriginalValue = RematerializedValuePair.second;
+ // Skip originals that already have a slot (from Live or an earlier record).
+ if (AllocaMap.count(OriginalValue) != 0)
+ continue;
+
+ emitAllocaFor(OriginalValue);
+ ++NumRematerializedValues;
+ }
+
+ // The next two loops are part of the same conceptual operation. We need to
+ // insert a store to the alloca after the original def and at each
+ // redefinition. We need to insert a load before each use. These are split
+ // into distinct loops for performance reasons.
+
+ // Update gc pointer after each statepoint: either store a relocated value or
+ // null (if no relocated value was found for this gc pointer and it is not a
+ // gc_result). This must happen before we update the statepoint with load of
+ // alloca otherwise we lose the link between statepoint and old def.
+ for (const auto &Info : Records) {
+ Value *Statepoint = Info.StatepointToken;
+
+ // This will be used for consistency check
+ // NOTE(review): the two insert*Stores helpers only add to this set when
+ // NDEBUG is not defined, yet the ClobberNonLive block below reads it
+ // unconditionally; with NDEBUG defined every alloca ends up in ToClobber.
+ // Confirm that is intended.
+ DenseSet<Value *> VisitedLiveValues;
+
+ // Insert stores for normal statepoint gc relocates
+ insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+ // In case if it was invoke statepoint
+ // we will insert stores for exceptional path gc relocates.
+ if (isa<InvokeInst>(Statepoint)) {
+ insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+ VisitedLiveValues);
+ }
+
+ // Do similar thing with rematerialized values
+ insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+ VisitedLiveValues);
+
+ if (ClobberNonLive) {
+ // As a debugging aid, pretend that an unrelocated pointer becomes null at
+ // the gc.statepoint. This will turn some subtle GC problems into
+ // slightly easier to debug SEGVs. Note that on large IR files with
+ // lots of gc.statepoints this is extremely costly both memory and time
+ // wise.
+ SmallVector<AllocaInst *, 64> ToClobber;
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+ // This value was relocated
+ if (VisitedLiveValues.count(Def)) {
+ continue;
+ }
+ ToClobber.push_back(Alloca);
+ }
+
+ // Stores a null pointer of the slot's element type into every alloca in
+ // ToClobber, immediately before IP.
+ auto InsertClobbersAt = [&](Instruction *IP) {
+ for (auto *AI : ToClobber) {
+ auto AIType = cast<PointerType>(AI->getType());
+ auto PT = cast<PointerType>(AIType->getElementType());
+ Constant *CPN = ConstantPointerNull::get(PT);
+ StoreInst *Store = new StoreInst(CPN, AI);
+ Store->insertBefore(IP);
+ }
+ };
+
+ // Insert the clobbering stores. These may get intermixed with the
+ // gc.results and gc.relocates, but that's fine.
+ if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+ InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+ InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
+ } else {
+ InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
+ }
+ }
+ }
+
+ // Update use with load allocas and add store for gc_relocated.
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ Value *Alloca = Pair.second;
+
+ // We pre-record the uses of allocas so that we don't have to worry about
+ // later update that changes the user information.
+
+ SmallVector<Instruction *, 20> Uses;
+ // PERF: trade a linear scan for repeated reallocation
+ Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+ for (User *U : Def->users()) {
+ if (!isa<ConstantExpr>(U)) {
+ // If the def has a ConstantExpr use, then the def is either a
+ // ConstantExpr use itself or null. In either case
+ // (recursively in the first, directly in the second), the oop
+ // it is ultimately dependent on is null and this particular
+ // use does not need to be fixed up.
+ Uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ // A user that references Def through several operands appears once per
+ // use in the list above; collapse those so each user is rewritten once.
+ std::sort(Uses.begin(), Uses.end());
+ auto Last = std::unique(Uses.begin(), Uses.end());
+ Uses.erase(Last, Uses.end());
+
+ for (Instruction *Use : Uses) {
+ if (isa<PHINode>(Use)) {
+ // For a phi the load is placed at the end of the matching incoming
+ // block (before its terminator) rather than before the phi itself.
+ PHINode *Phi = cast<PHINode>(Use);
+ for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+ if (Def == Phi->getIncomingValue(i)) {
+ LoadInst *Load = new LoadInst(
+ Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+ Phi->setIncomingValue(i, Load);
+ }
+ }
+ } else {
+ LoadInst *Load = new LoadInst(Alloca, "", Use);
+ Use->replaceUsesOfWith(Def, Load);
+ }
+ }
+
+ // Emit store for the initial gc value. Store must be inserted after load,
+ // otherwise store will be in alloca's use list and an extra load will be
+ // inserted before it.
+ StoreInst *Store = new StoreInst(Def, Alloca);
+ if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+ // InvokeInst is a TerminatorInst so the store need to be inserted
+ // into its normal destination block.
+ BasicBlock *NormalDest = Invoke->getNormalDest();
+ Store->insertBefore(NormalDest->getFirstNonPHI());
+ } else {
+ assert(!Inst->isTerminator() &&
+ "The only TerminatorInst that can produce a value is "
+ "InvokeInst which is handled above.");
+ Store->insertAfter(Inst);
+ }
+ } else {
+ // Non-instruction defs are expected to be arguments; their initial store
+ // goes right after the alloca itself.
+ assert(isa<Argument>(Def));
+ Store->insertAfter(cast<Instruction>(Alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // Apply mem2reg to promote alloca to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ // Count the entry-block allocas back down; the balance must return to zero.
+ for (auto &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum--;
+ assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Remove duplicate elements from \p Vec in place while keeping the first
+/// occurrence of each element in its original relative position. Unlike a
+/// sort + std::unique approach the input is never sorted; doing so would
+/// change the output of a couple of tests in ways which make them less useful
+/// in testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+  SmallSet<T, 8> Seen;
+
+  // True for any element that has been encountered before.
+  auto IsRepeat = [&Seen](const T &Elem) {
+    const bool FirstTime = Seen.insert(Elem).second;
+    return !FirstTime;
+  };
+
+  auto NewEnd = std::remove_if(Vec.begin(), Vec.end(), IsRepeat);
+  Vec.erase(NewEnd, Vec.end());
+}
+
+/// Insert calls to a dummy vararg function ("__tmp_use") that take all of
+/// \p Values as arguments, so that each value has an explicit use after the
+/// call site \p CS. Every call created is appended to \p Holders. Nothing is
+/// inserted when \p Values is empty.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+                                 SmallVectorImpl<CallInst *> &Holders) {
+  // No values to hold live, might as well not insert the empty holder.
+  if (Values.empty())
+    return;
+
+  auto *Safepoint = CS.getInstruction();
+  Module *M = Safepoint->getModule();
+
+  // The holder is a void-returning vararg function, declared on demand.
+  FunctionType *HolderTy =
+      FunctionType::get(Type::getVoidTy(M->getContext()), true);
+  Function *Func = cast<Function>(M->getOrInsertFunction("__tmp_use", HolderTy));
+
+  auto AddHolderBefore = [&](auto *InsertPt) {
+    Holders.push_back(CallInst::Create(Func, Values, "", InsertPt));
+  };
+
+  if (!CS.isCall()) {
+    // Invoke safepoints get a holder in both the normal and the exceptional
+    // destination block.
+    auto *II = cast<InvokeInst>(Safepoint);
+    AddHolderBefore(&*II->getNormalDest()->getFirstInsertionPt());
+    AddHolderBefore(&*II->getUnwindDest()->getFirstInsertionPt());
+    return;
+  }
+
+  // Call safepoints get a single holder right after the call.
+  AddHolderBefore(&*++Safepoint->getIterator());
+}
+
+/// Compute the liveness data for \p F once, then run the per-call-site
+/// liveness analysis for each record. Entry i of \p records is analyzed
+/// against entry i of \p toUpdate.
+static void findLiveReferences(
+    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  GCPtrLivenessData OriginalLivenessData;
+  computeLiveInValues(DT, F, OriginalLivenessData);
+
+  size_t Idx = 0;
+  for (struct PartiallyConstructedSafepointRecord &Record : records) {
+    analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[Idx], Record);
+    ++Idx;
+  }
+}
+
+/// Remove any vector of pointers from the live set by scalarizing them over the
+/// statepoint instruction. Adds the scalarized pieces to the live set. It
+/// would be preferable to include the vector in the statepoint itself, but
+/// the lowering code currently does not handle that. Extending it would be
+/// slightly non-trivial since it requires a format change. Given how rare
+/// such cases are (for the moment?) scalarizing is an acceptable compromise.
+///
+/// \param StatepointInst The call or invoke the live set belongs to.
+/// \param LiveSet Live values at the statepoint; vector-typed entries are
+/// replaced by their extracted elements.
+/// \param PointerToBase Base mapping, updated element-wise to match.
+/// \param DT Dominator tree passed through to PromoteMemToReg.
+static void splitVectorValues(Instruction *StatepointInst,
+ StatepointLiveSetTy &LiveSet,
+ DenseMap<Value *, Value *>& PointerToBase,
+ DominatorTree &DT) {
+ SmallVector<Value *, 16> ToSplit;
+ for (Value *V : LiveSet)
+ if (isa<VectorType>(V->getType()))
+ ToSplit.push_back(V);
+
+ if (ToSplit.empty())
+ return;
+
+ // Maps each split vector to the scalars extracted from it before the
+ // statepoint.
+ DenseMap<Value *, SmallVector<Value *, 16>> ElementMapping;
+
+ Function &F = *(StatepointInst->getParent()->getParent());
+
+ DenseMap<Value *, AllocaInst *> AllocaMap;
+ // First is normal return, second is exceptional return (invoke only)
+ DenseMap<Value *, std::pair<Value *, Value *>> Replacements;
+ for (Value *V : ToSplit) {
+ AllocaInst *Alloca =
+ new AllocaInst(V->getType(), "", F.getEntryBlock().getFirstNonPHI());
+ AllocaMap[V] = Alloca;
+
+ // Extract every element right before the statepoint.
+ VectorType *VT = cast<VectorType>(V->getType());
+ IRBuilder<> Builder(StatepointInst);
+ SmallVector<Value *, 16> Elements;
+ for (unsigned i = 0; i < VT->getNumElements(); i++)
+ Elements.push_back(Builder.CreateExtractElement(V, Builder.getInt32(i)));
+ ElementMapping[V] = Elements;
+
+ // Rebuilds a vector of type VT from Elements with a chain of
+ // insertelements placed before IP, and returns the final value.
+ auto InsertVectorReform = [&](Instruction *IP) {
+ Builder.SetInsertPoint(IP);
+ Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+ Value *ResultVec = UndefValue::get(VT);
+ for (unsigned i = 0; i < VT->getNumElements(); i++)
+ ResultVec = Builder.CreateInsertElement(ResultVec, Elements[i],
+ Builder.getInt32(i));
+ return ResultVec;
+ };
+
+ if (isa<CallInst>(StatepointInst)) {
+ // For a call the vector is re-formed right after the statepoint.
+ BasicBlock::iterator Next(StatepointInst);
+ Next++;
+ Instruction *IP = &*(Next);
+ Replacements[V].first = InsertVectorReform(IP);
+ Replacements[V].second = nullptr;
+ } else {
+ InvokeInst *Invoke = cast<InvokeInst>(StatepointInst);
+ // We've already normalized - check that we don't have shared destination
+ // blocks
+ BasicBlock *NormalDest = Invoke->getNormalDest();
+ assert(!isa<PHINode>(NormalDest->begin()));
+ BasicBlock *UnwindDest = Invoke->getUnwindDest();
+ assert(!isa<PHINode>(UnwindDest->begin()));
+ // Insert insert element sequences in both successors
+ Instruction *IP = &*(NormalDest->getFirstInsertionPt());
+ Replacements[V].first = InsertVectorReform(IP);
+ IP = &*(UnwindDest->getFirstInsertionPt());
+ Replacements[V].second = InsertVectorReform(IP);
+ }
+ }
+
+ for (Value *V : ToSplit) {
+ AllocaInst *Alloca = AllocaMap[V];
+
+ // Capture all users before we start mutating use lists
+ // (every user is cast to Instruction here, so non-instruction users are
+ // not expected)
+ SmallVector<Instruction *, 16> Users;
+ for (User *U : V->users())
+ Users.push_back(cast<Instruction>(U));
+
+ for (Instruction *I : Users) {
+ if (auto Phi = dyn_cast<PHINode>(I)) {
+ // Phi operands are loaded at the end of the matching incoming block.
+ for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++)
+ if (V == Phi->getIncomingValue(i)) {
+ LoadInst *Load = new LoadInst(
+ Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+ Phi->setIncomingValue(i, Load);
+ }
+ } else {
+ LoadInst *Load = new LoadInst(Alloca, "", I);
+ I->replaceUsesOfWith(V, Load);
+ }
+ }
+
+ // Store the original value and the replacement value into the alloca
+ StoreInst *Store = new StoreInst(V, Alloca);
+ if (auto I = dyn_cast<Instruction>(V))
+ Store->insertAfter(I);
+ else
+ Store->insertAfter(Alloca);
+
+ // Normal return for invoke, or call return
+ Instruction *Replacement = cast<Instruction>(Replacements[V].first);
+ (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
+ // Unwind return for invoke only
+ Replacement = cast_or_null<Instruction>(Replacements[V].second);
+ if (Replacement)
+ (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
+ }
+
+ // apply mem2reg to promote alloca to SSA
+ SmallVector<AllocaInst *, 16> Allocas;
+ for (Value *V : ToSplit)
+ Allocas.push_back(AllocaMap[V]);
+ PromoteMemToReg(Allocas, DT);
+
+ // Update our tracking of live pointers and base mappings to account for the
+ // changes we just made.
+ for (Value *V : ToSplit) {
+ auto &Elements = ElementMapping[V];
+
+ LiveSet.erase(V);
+ LiveSet.insert(Elements.begin(), Elements.end());
+ // We need to update the base mapping as well.
+ assert(PointerToBase.count(V));
+ Value *OldBase = PointerToBase[V];
+ // NOTE(review): operator[] inserts an empty entry if OldBase was not itself
+ // split above, which could grow the map while the Elements reference is
+ // still held; the size assert below only catches this in asserts builds.
+ // Presumably every base of a split vector is also in ToSplit -- confirm.
+ auto &BaseElements = ElementMapping[OldBase];
+ PointerToBase.erase(V);
+ assert(Elements.size() == BaseElements.size());
+ for (unsigned i = 0; i < Elements.size(); i++) {
+ Value *Elem = Elements[i];
+ PointerToBase[Elem] = BaseElements[i];
+ }
+ }
+}
+
+// Helper for "rematerializeLiveValues". Starting at "CurrentValue", follows
+// the pointer operand of GEPs and the source operand of no-op casts until
+// "BaseValue" is reached. Every instruction stepped through is appended to
+// "ChainToBase" (derived-most first); "BaseValue" itself is not recorded.
+// Returns true if the base was reached, and false as soon as a value that is
+// neither a GEP nor a no-op cast is encountered. On failure "ChainToBase"
+// still holds whatever was visited up to that point.
+static bool findRematerializableChainToBasePointer(
+    SmallVectorImpl<Instruction*> &ChainToBase,
+    Value *CurrentValue, Value *BaseValue) {
+  for (Value *Cursor = CurrentValue; Cursor != BaseValue;) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Cursor)) {
+      ChainToBase.push_back(GEP);
+      Cursor = GEP->getPointerOperand();
+      continue;
+    }
+
+    // Anything other than a GEP must be a no-op cast to be supported.
+    auto *CI = dyn_cast<CastInst>(Cursor);
+    if (!CI || !CI->isNoopCast(CI->getModule()->getDataLayout()))
+      return false;
+
+    ChainToBase.push_back(CI);
+    Cursor = CI->getOperand(0);
+  }
+
+  // The walk ended on the base value.
+  return true;
+}
+
+// Helper for "rematerializeLiveValues". Sums up the estimated cost of every
+// instruction in "Chain", which may only contain GEPs and no-op casts:
+//  - a cast contributes TTI's cast instruction cost;
+//  - a GEP contributes TTI's address computation cost, plus a flat 2 when not
+//    all of its indices are constant.
+static unsigned
+chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
+                       TargetTransformInfo &TTI) {
+  unsigned Total = 0;
+
+  for (Instruction *Link : Chain) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) {
+      // Cost of the address calculation
+      Type *ValTy = GEP->getPointerOperandType()->getPointerElementType();
+      Total += TTI.getAddressComputationCost(ValTy);
+
+      // And cost of the GEP itself
+      // TODO: Use TTI->getGEPCost here (it exists, but appears to be not
+      //       allowed for the external usage)
+      if (!GEP->hasAllConstantIndices())
+        Total += 2;
+      continue;
+    }
+
+    if (auto *CI = dyn_cast<CastInst>(Link)) {
+      assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+             "non noop cast is found during rematerialization");
+
+      Type *SrcTy = CI->getOperand(0)->getType();
+      Total += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+      continue;
+    }
+
+    llvm_unreachable("unsupported instruciton type during rematerialization");
+  }
+
+  return Total;
+}
+
+// From the statepoint live set pick values that are cheaper to recompute than
+// to relocate. Remove these values from the live set, rematerialize them after
+// statepoint and record them in "Info" structure. Note that similar to
+// relocated values we don't do any user adjustments here.
+//
+// "CS" is the call site being processed, "Info" supplies LiveSet and
+// PointerToBase and receives RematerializedValues, and "TTI" provides the
+// cost estimates.
+static void rematerializeLiveValues(CallSite CS,
+ PartiallyConstructedSafepointRecord &Info,
+ TargetTransformInfo &TTI) {
+ const unsigned int ChainLengthThreshold = 10;
+
+ // Record values we are going to delete from this statepoint live set.
+ // We can not do this in following loop due to iterator invalidation.
+ SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+ for (Value *LiveValue: Info.LiveSet) {
+ // For each live pointer find its defining chain
+ SmallVector<Instruction *, 3> ChainToBase;
+ assert(Info.PointerToBase.count(LiveValue));
+ bool FoundChain =
+ findRematerializableChainToBasePointer(ChainToBase,
+ LiveValue,
+ Info.PointerToBase[LiveValue]);
+ // Nothing to do, or chain is too long
+ // (an empty chain means the live value is its own base)
+ if (!FoundChain ||
+ ChainToBase.size() == 0 ||
+ ChainToBase.size() > ChainLengthThreshold)
+ continue;
+
+ // Compute cost of this chain
+ unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
+ // TODO: We can also account for cases when we will be able to remove some
+ // of the rematerialized values by later optimization passes. I.e if
+ // we rematerialized several intersecting chains. Or if original values
+ // don't have any uses besides this statepoint.
+
+ // For invokes we need to rematerialize each chain twice - for normal and
+ // for unwind basic blocks. Model this by multiplying cost by two.
+ if (CS.isInvoke()) {
+ Cost *= 2;
+ }
+ // If it's too expensive - skip it
+ if (Cost >= RematerializationThreshold)
+ continue;
+
+ // Remove value from the live set
+ LiveValuesToBeDeleted.push_back(LiveValue);
+
+ // Clone instructions and record them inside "Info" structure
+
+ // Walk backwards to visit top-most instructions first
+ // (the chain was collected from the live value towards the base)
+ std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+ // Utility function which clones all instructions from "ChainToBase"
+ // and inserts them before "InsertBefore". Returns rematerialized value
+ // which should be used after statepoint.
+ auto rematerializeChain = [&ChainToBase](Instruction *InsertBefore) {
+ Instruction *LastClonedValue = nullptr;
+ Instruction *LastValue = nullptr;
+ for (Instruction *Instr: ChainToBase) {
+ // Only GEP's and casts are supported as we need to be careful to not
+ // introduce any new uses of pointers not in the liveset.
+ // Note that it's fine to introduce new uses of pointers which were
+ // otherwise not used after this statepoint.
+ assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+ Instruction *ClonedValue = Instr->clone();
+ ClonedValue->insertBefore(InsertBefore);
+ ClonedValue->setName(Instr->getName() + ".remat");
+
+ // If it is not first instruction in the chain then it uses previously
+ // cloned value. We should update it to use cloned value.
+ if (LastClonedValue) {
+ assert(LastValue);
+ ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+ // Assert that cloned instruction does not use any instructions from
+ // this chain other than LastClonedValue
+ for (auto OpValue : ClonedValue->operand_values()) {
+ assert(std::find(ChainToBase.begin(), ChainToBase.end(), OpValue) ==
+ ChainToBase.end() &&
+ "incorrect use in rematerialization chain");
+ }
+#endif
+ }
+
+ LastClonedValue = ClonedValue;
+ LastValue = Instr;
+ }
+ assert(LastClonedValue);
+ return LastClonedValue;
+ };
+
+ // Different cases for calls and invokes. For invokes we need to clone
+ // instructions both on normal and unwind path.
+ if (CS.isCall()) {
+ Instruction *InsertBefore = CS.getInstruction()->getNextNode();
+ assert(InsertBefore);
+ Instruction *RematerializedValue = rematerializeChain(InsertBefore);
+ Info.RematerializedValues[RematerializedValue] = LiveValue;
+ } else {
+ InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());
+
+ Instruction *NormalInsertBefore =
+ &*Invoke->getNormalDest()->getFirstInsertionPt();
+ Instruction *UnwindInsertBefore =
+ &*Invoke->getUnwindDest()->getFirstInsertionPt();
+
+ Instruction *NormalRematerializedValue =
+ rematerializeChain(NormalInsertBefore);
+ Instruction *UnwindRematerializedValue =
+ rematerializeChain(UnwindInsertBefore);
+
+ // Both clones map back to the same original live value.
+ Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+ Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+ }
+ }
+
+ // Remove rematerialized values from the live set
+ for (auto LiveValue: LiveValuesToBeDeleted) {
+ Info.LiveSet.erase(LiveValue);
+ }
+}
+
+/// Drives the rewrite of every call site in ToUpdate. In order, this:
+///  1. (asserts builds only) checks that ToUpdate has no duplicates and that
+///     every call site belongs to F,
+///  2. normalizes the normal/unwind destinations of invokes (this may
+///     restructure the CFG),
+///  3. computes the live gc pointers and their base pointers for each call
+///     site, using temporary "holder" calls to keep deopt arguments and base
+///     pointers live while liveness is (re)computed,
+///  4. drops live values whose base is a Constant, optionally splits live
+///     vector values (UseVectorSplit), and runs rematerializeLiveValues,
+///  5. replaces each call site via makeStatepointExplicit and then applies
+///     the deferred replacements, and
+///  6. rewrites uses of the relocated values via relocationViaAlloca.
+///
+/// ToUpdate is cleared as a side effect. The return value is true iff at
+/// least one safepoint record exists (Records is sized from ToUpdate on
+/// entry).
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ SmallVectorImpl<CallSite> &ToUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallSite> Uniqued;
+ Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+ assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
+
+ for (CallSite CS : ToUpdate) {
+ assert(CS.getInstruction()->getParent()->getParent() == &F);
+ assert((UseDeoptBundles || isStatepoint(CS)) &&
+ "expected to already be a deopt statepoint");
+ }
+#endif
+
+ // When inserting gc.relocates for invokes, we need to be able to insert at
+ // the top of the successor blocks. See the comment on
+ // normalizeForInvokeSafepoint on exactly what is needed. Note that this step
+ // may restructure the CFG.
+ for (CallSite CS : ToUpdate) {
+ if (!CS.isInvoke())
+ continue;
+ auto *II = cast<InvokeInst>(CS.getInstruction());
+ normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+ normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
+ }
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> Holders;
+
+ // Insert a dummy call with all of the arguments to the vm_state we'll need
+ // for the actual safepoint insertion. This ensures reference arguments in
+ // the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated.)
+ for (CallSite CS : ToUpdate) {
+ SmallVector<Value *, 64> DeoptValues;
+
+ // The deopt state comes either from the "deopt" operand bundle or from the
+ // vm_state arguments of an existing statepoint, depending on the mode.
+ iterator_range<const Use *> DeoptStateRange =
+ UseDeoptBundles
+ ? iterator_range<const Use *>(GetDeoptBundleOperands(CS))
+ : iterator_range<const Use *>(Statepoint(CS).vm_state_args());
+
+ for (Value *Arg : DeoptStateRange) {
+ assert(!isUnhandledGCPointerType(Arg->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+
+ insertUseHolderAfter(CS, DeoptValues, Holders);
+ }
+
+ // One record per call site; Records[i] always corresponds to ToUpdate[i].
+ SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
+
+ // A) Identify all gc pointers which are statically live at the given call
+ // site.
+ findLiveReferences(F, DT, ToUpdate, Records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &info = Records[i];
+ findBasePointers(DT, DVCache, ToUpdate[i], info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ Holders.reserve(Holders.size() + Records.size());
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : Info.PointerToBase)
+ Bases.push_back(Pair.second);
+
+ insertUseHolderAfter(ToUpdate[i], Bases, Holders);
+ }
+
+ // By selecting base pointers, we've effectively inserted new uses. Thus, we
+ // need to rerun liveness. We may *also* have inserted new defs, but that's
+ // not the key issue.
+ recomputeLiveInValues(F, DT, ToUpdate, Records);
+
+ if (PrintBasePointers) {
+ for (auto &Info : Records) {
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : Info.PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+ }
+
+ // It is possible that non-constant live variables have a constant base. For
+ // example, a GEP with a variable offset from a global. In this case we can
+ // remove it from the liveset. We already don't add constants to the liveset
+ // because we assume they won't move at runtime and the GC doesn't need to be
+ // informed about them. The same reasoning applies if the base is constant.
+ // Note that the relocation placement code relies on this filtering for
+ // correctness as it expects the base to be in the liveset, which isn't true
+ // if the base is constant.
+ for (auto &Info : Records)
+ for (auto &BasePair : Info.PointerToBase)
+ if (isa<Constant>(BasePair.second))
+ Info.LiveSet.erase(BasePair.first);
+
+ // The holder calls have served their purpose (keeping values live during
+ // the liveness queries above); remove them from the IR.
+ for (CallInst *CI : Holders)
+ CI->eraseFromParent();
+
+ Holders.clear();
+
+ // Do a limited scalarization of any live at safepoint vector values which
+ // contain pointers. This enables this pass to run after vectorization at
+ // the cost of some possible performance loss. Note: This is known to not
+ // handle updating of the side tables correctly which can lead to relocation
+ // bugs when the same vector is live at multiple statepoints. We're in the
+ // process of implementing the alternate lowering - relocating the
+ // vector-of-pointers as first class item and updating the backend to
+ // understand that - but that's not yet complete.
+ if (UseVectorSplit)
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+ Instruction *Statepoint = ToUpdate[i].getInstruction();
+ splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet,
+ Info.PointerToBase, DT);
+ }
+
+ // In order to reduce live set of statepoint we might choose to rematerialize
+ // some values instead of relocating them. This is purely an optimization and
+ // does not influence correctness.
+ for (size_t i = 0; i < Records.size(); i++)
+ rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
+
+ // We need this to safely RAUW and delete call or invoke return values that
+ // may themselves be live over a statepoint. For details, please see usage in
+ // makeStatepointExplicitImpl.
+ std::vector<DeferredReplacement> Replacements;
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint can not be a live variable, thus we can remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < Records.size(); i++)
+ makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+ ToUpdate.clear(); // prevent accidental use of invalid CallSites
+
+ for (auto &PR : Replacements)
+ PR.doReplacement();
+
+ Replacements.clear();
+
+ for (auto &Info : Records) {
+ // These live sets may contain stale Value pointers, since we replaced calls
+ // with operand bundles with calls wrapped in gc.statepoint, and some of
+ // those calls may have been def'ing live gc pointers. Clear these out to
+ // avoid accidentally using them.
+ //
+ // TODO: We should create a separate data structure that does not contain
+ // these live sets, and migrate to using that data structure from this point
+ // onward.
+ Info.LiveSet.clear();
+ Info.PointerToBase.clear();
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> Live;
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the live set is embedded in the statepoint (and updated), so
+ // we just grab that.
+ Statepoint Statepoint(Info.StatepointToken);
+ Live.insert(Live.end(), Statepoint.gc_args_begin(),
+ Statepoint.gc_args_end());
+#ifndef NDEBUG
+ // Do some basic sanity checks on our liveness results before performing
+ // relocation. Relocation can and will turn mistakes in liveness results
+ // into non-sensical code which is much harder to debug.
+ // TODO: It would be nice to test consistency as well
+ assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
+ "statepoint must be reachable or liveness is meaningless");
+ for (Value *V : Statepoint.gc_args()) {
+ if (!isa<Instruction>(V))
+ // Non-instruction values trivially dominate all possible uses
+ continue;
+ auto *LiveInst = cast<Instruction>(V);
+ assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+ "unreachable values should never be live");
+ assert(DT.dominates(LiveInst, Info.StatepointToken) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+#endif
+ }
+ // The same value can be live at several statepoints; keep one copy of each.
+ unique_unsorted(Live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto *Ptr : Live)
+ assert(isHandledGCPointerType(Ptr->getType()) &&
+ "must be a gc pointer type");
+#endif
+
+ relocationViaAlloca(F, DT, Live, Records);
+ return !Records.empty();
+}
+
+// Strips dereferenceable, dereferenceable_or_null and noalias from the
+// attribute slot `Index` of `AH`. Works for both return values and arguments,
+// and for both Functions and CallSites (anything exposing the attribute
+// accessors used below).
+template <typename AttrHolder>
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+                                      unsigned Index) {
+  // Collect exactly the attributes that are present so that the removal
+  // below names only attributes the holder actually carries.
+  AttrBuilder ToStrip;
+
+  if (auto DerefBytes = AH.getDereferenceableBytes(Index))
+    ToStrip.addAttribute(
+        Attribute::get(Ctx, Attribute::Dereferenceable, DerefBytes));
+
+  if (auto DerefOrNullBytes = AH.getDereferenceableOrNullBytes(Index))
+    ToStrip.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
+                                        DerefOrNullBytes));
+
+  if (AH.doesNotAlias(Index))
+    ToStrip.addAttribute(Attribute::NoAlias);
+
+  // Nothing to strip: leave the attribute set untouched.
+  if (ToStrip.empty())
+    return;
+
+  AH.setAttributes(AH.getAttributes().removeAttributes(
+      Ctx, Index, AttributeSet::get(Ctx, Index, ToStrip)));
+}
+
+/// Strips the attributes handled by RemoveNonValidAttrAtIndex from every
+/// pointer-typed argument of F and, if F returns a pointer, from its return
+/// slot.
+void
+RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
+  LLVMContext &Ctx = F.getContext();
+
+  for (Argument &Arg : F.args()) {
+    if (!isa<PointerType>(Arg.getType()))
+      continue;
+    // Attribute indices for arguments are offset by one from the argument
+    // number.
+    RemoveNonValidAttrAtIndex(Ctx, F, Arg.getArgNo() + 1);
+  }
+
+  if (!isa<PointerType>(F.getReturnType()))
+    return;
+  RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+}
+
+/// Walks every instruction of F and
+///  - rewrites an immutable !tbaa access tag (four operands, last one equal
+///    to 1) into the equivalent mutable tag built from the same base type,
+///    access type and offset, and
+///  - for every call site, strips the attributes handled by
+///    RemoveNonValidAttrAtIndex from pointer-typed arguments and from a
+///    pointer-typed return value.
+///
+/// The two steps are independent: a call that carries an already-mutable
+/// !tbaa tag must still have its call-site attributes stripped. (Skipping to
+/// the next instruction when the tag needs no rewrite would also skip the
+/// call-site step for that instruction.)
+void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
+  if (F.empty())
+    return;
+
+  LLVMContext &Ctx = F.getContext();
+  MDBuilder Builder(Ctx);
+
+  for (Instruction &I : instructions(F)) {
+    if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
+      assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
+      bool IsImmutableTBAA =
+          MD->getNumOperands() == 4 &&
+          mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
+
+      // Only tags flagged immutable need rewriting; a tag that is already
+      // mutable is left alone. Do not 'continue' here: the call-site
+      // handling below must run regardless of the tag's state.
+      if (IsImmutableTBAA) {
+        MDNode *Base = cast<MDNode>(MD->getOperand(0));
+        MDNode *Access = cast<MDNode>(MD->getOperand(1));
+        uint64_t Offset =
+            mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
+
+        MDNode *MutableTBAA =
+            Builder.createTBAAStructTagNode(Base, Access, Offset);
+        I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
+      }
+    }
+
+    if (CallSite CS = CallSite(&I)) {
+      // Argument attribute indices are offset by one from the argument
+      // position.
+      for (int i = 0, e = CS.arg_size(); i != e; i++)
+        if (isa<PointerType>(CS.getArgument(i)->getType()))
+          RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
+      if (isa<PointerType>(CS.getType()))
+        RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+    }
+  }
+}
+
+/// Policy hook deciding whether this pass rewrites F. Only functions that
+/// name a GC are considered, and of those only the "statepoint-example" and
+/// "coreclr" strategies are accepted. Kept as a separate function so custom
+/// logic has a single extension point.
+static bool shouldRewriteStatepointsIn(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (!F.hasGC())
+    return false;
+
+  const auto &GCName = F.getGC();
+  if (StringRef("statepoint-example") == GCName)
+    return true;
+  return StringRef("coreclr") == GCName;
+}
+
+/// Strips attributes from every function in M: first from all prototypes,
+/// then from all bodies. Expects (and asserts) that at least one function in
+/// the module is accepted by shouldRewriteStatepointsIn.
+void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
+  // assert() already compiles away under NDEBUG, so no extra guard is needed.
+  assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) &&
+         "precondition!");
+
+  // All prototypes are processed before any body.
+  for (Function &Fn : M) {
+    stripNonValidAttributesFromPrototype(Fn);
+  }
+
+  for (Function &Fn : M) {
+    stripNonValidAttributesFromBody(Fn);
+  }
+}
+
+/// Per-function entry point. Returns false without touching anything for
+/// declarations, empty functions and functions rejected by
+/// shouldRewriteStatepointsIn. Otherwise it
+///  - collects the call sites in reachable code that need rewriting,
+///  - removes unreachable blocks if a candidate was found in unreachable code,
+///  - folds single-entry PHI nodes and moves single-use icmp branch
+///    conditions directly in front of their terminator, and
+///  - hands the collected call sites to insertParsePoints.
+/// The result reports whether the IR was changed; note that the two clean-up
+/// loops set the flag without checking whether anything actually moved.
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ return false;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ // With deopt bundles every call site that is not a GC leaf call needs a
+ // rewrite; otherwise only instructions that already are statepoints do.
+ auto NeedsRewrite = [](Instruction &I) {
+ if (UseDeoptBundles) {
+ if (ImmutableCallSite CS = ImmutableCallSite(&I))
+ return !callsGCLeafFunction(CS);
+ return false;
+ }
+
+ return isStatepoint(I);
+ };
+
+ // Gather all the statepoints which need to be rewritten. Be careful to only
+ // consider those in reachable code since we need to ask dominance queries
+ // when rewriting. We'll delete the unreachable ones in a moment.
+ SmallVector<CallSite, 64> ParsePointNeeded;
+ bool HasUnreachableStatepoint = false;
+ for (Instruction &I : instructions(F)) {
+ // TODO: only the ones with the flag set!
+ if (NeedsRewrite(I)) {
+ if (DT.isReachableFromEntry(I.getParent()))
+ ParsePointNeeded.push_back(CallSite(&I));
+ else
+ HasUnreachableStatepoint = true;
+ }
+ }
+
+ bool MadeChange = false;
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers. Rather than be fancy, we
+ // just reuse a utility function which removes the unreachable blocks.
+ if (HasUnreachableStatepoint)
+ MadeChange |= removeUnreachableBlocks(F);
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return MadeChange;
+
+ // As a prepass, go ahead and aggressively destroy single entry phi nodes.
+ // These are created by LCSSA. They have the effect of increasing the size
+ // of liveness sets for no good reason. It may be harder to do this post
+ // insertion since relocations and base phis can confuse things.
+ // Note: MadeChange is set for every block with a unique predecessor,
+ // independent of whether that block contained a PHI to fold.
+ for (BasicBlock &BB : F)
+ if (BB.getUniquePredecessor()) {
+ MadeChange = true;
+ FoldSingleEntryPHINodes(&BB);
+ }
+
+ // Before we start introducing relocations, we want to tweak the IR a bit to
+ // avoid unfortunate code generation effects. The main example is that we
+ // want to try to make sure the comparison feeding a branch is after any
+ // safepoints. Otherwise, we end up with a comparison of pre-relocation
+ // values feeding a branch after relocation. This is semantically correct,
+ // but results in extra register pressure since both the pre-relocation and
+ // post-relocation copies must be available in registers. For code without
+ // relocations this is handled elsewhere, but teaching the scheduler to
+ // reverse the transform we're about to do would be slightly complex.
+ // Note: This may extend the live range of the inputs to the icmp and thus
+ // increase the liveset of any statepoint we move over. This is profitable
+ // as long as all statepoints are in rare blocks. If we had in-register
+ // lowering for live values this would be a much safer transform.
+ // Returns the instruction computing the condition of a conditional branch,
+ // or nullptr for any other terminator (or a non-instruction condition).
+ auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+ if (auto *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional())
+ return dyn_cast<Instruction>(BI->getCondition());
+ // TODO: Extend this to handle switches
+ return nullptr;
+ };
+ for (BasicBlock &BB : F) {
+ TerminatorInst *TI = BB.getTerminator();
+ if (auto *Cond = getConditionInst(TI))
+ // TODO: Handle more than just ICmps here. We should be able to move
+ // most instructions without side effects or memory access.
+ if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+ MadeChange = true;
+ Cond->moveBefore(TI);
+ }
+ }
+
+ MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
+ return MadeChange;
+}
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
+
+/// Walks the instructions in [rbegin, rend) backwards and updates LiveTmp in
+/// place: on entry it holds the values live after the first visited
+/// instruction, on exit the values live before the last visited one.
+static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
+                                BasicBlock::reverse_iterator rend,
+                                DenseSet<Value *> &LiveTmp) {
+  for (BasicBlock::reverse_iterator It = rbegin; It != rend; ++It) {
+    Instruction *Inst = &*It;
+
+    // Def: the value produced here is not live above this point.
+    LiveTmp.erase(Inst);
+
+    // PHI operands are uses on the incoming edges, not in this block; they
+    // are accounted for when the LiveOut sets of the predecessors are seeded.
+    if (isa<PHINode>(Inst))
+      continue;
+
+    // Uses: every handled, non-constant gc pointer operand becomes live.
+    for (Value *Op : Inst->operands()) {
+      assert(!isUnhandledGCPointerType(Op->getType()) &&
+             "support for FCA unimplemented");
+
+      // Excluding everything constant is slightly subtle. There are two
+      // independent reasons:
+      // - We assume that things which are constant (from LLVM's definition)
+      //   do not move at runtime. For example, the address of a global
+      //   variable is fixed, even though its contents may not be.
+      // - Second, we can't disallow arbitrary inttoptr constants even
+      //   if the language frontend does. Optimization passes are free to
+      //   locally exploit facts without respect to global reachability. This
+      //   can create sections of code which are dynamically unreachable and
+      //   contain just about anything. (see constants.ll in tests)
+      if (!isHandledGCPointerType(Op->getType()) || isa<Constant>(Op))
+        continue;
+
+      LiveTmp.insert(Op);
+    }
+  }
+}
+
+/// Adds to LiveTmp every handled, non-constant gc pointer that flows out of
+/// BB into a PHI node at the top of one of BB's successors.
+static void computeLiveOutSeed(BasicBlock *BB, DenseSet<Value *> &LiveTmp) {
+  for (BasicBlock *Succ : successors(BB)) {
+    // The PHIs form the leading run of the successor block.
+    const BasicBlock::iterator PhiEnd(Succ->getFirstNonPHI());
+    for (BasicBlock::iterator It = Succ->begin(); It != PhiEnd; ++It) {
+      PHINode *PN = cast<PHINode>(&*It);
+      Value *Incoming = PN->getIncomingValueForBlock(BB);
+      assert(!isUnhandledGCPointerType(Incoming->getType()) &&
+             "support for FCA unimplemented");
+
+      if (!isHandledGCPointerType(Incoming->getType()) ||
+          isa<Constant>(Incoming))
+        continue;
+
+      LiveTmp.insert(Incoming);
+    }
+  }
+}
+
+/// Returns the set of instructions in BB whose result has a handled gc
+/// pointer type, i.e. the values defined ("killed") by the block.
+static DenseSet<Value *> computeKillSet(BasicBlock *BB) {
+  DenseSet<Value *> Defs;
+  for (Instruction &Inst : *BB) {
+    if (!isHandledGCPointerType(Inst.getType()))
+      continue;
+    Defs.insert(&Inst);
+  }
+  return Defs;
+}
+
+#ifndef NDEBUG
+/// Debug-only check that every instruction in 'Live' dominates 'TI'. Values
+/// that are not instructions are ignored. With TermOkay set, 'TI' itself is
+/// allowed to appear in the set.
+static void checkBasicSSA(DominatorTree &DT, DenseSet<Value *> &Live,
+                          TerminatorInst *TI, bool TermOkay = false) {
+  for (Value *V : Live) {
+    auto *Def = dyn_cast<Instruction>(V);
+    if (!Def)
+      continue;
+
+    // The terminator can be a member of the LiveOut set. LLVM's definition
+    // of instruction dominance states that V does not dominate itself, so
+    // that case has to be let through explicitly.
+    if (TermOkay && TI == Def)
+      continue;
+
+    assert(DT.dominates(Def, TI) &&
+           "basic SSA liveness expectation violated by liveness analysis");
+  }
+}
+
+/// Debug-only check that the LiveSet, LiveOut and LiveIn sets recorded for BB
+/// obey basic SSA properties; useful for spotting a missed def. Only the
+/// LiveOut set may contain the block's own terminator.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+                          BasicBlock &BB) {
+  TerminatorInst *Term = BB.getTerminator();
+  checkBasicSSA(DT, Data.LiveSet[&BB], Term);
+  checkBasicSSA(DT, Data.LiveOut[&BB], Term, /*TermOkay=*/true);
+  checkBasicSSA(DT, Data.LiveIn[&BB], Term);
+}
+#endif
+
+/// Computes per-block liveness of gc pointers for F and stores it in Data
+/// (KillSet, LiveSet, LiveOut and LiveIn, each keyed by basic block).
+///
+/// Each block is first seeded from its own instructions and from the PHI
+/// nodes of its successors. Liveness is then propagated backwards with a
+/// worklist of predecessors until no set grows any further. Because the sets
+/// only ever grow, a size comparison is used to detect a change. In asserts
+/// builds the result is checked against basic SSA dominance properties.
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data) {
+
+ SmallSetVector<BasicBlock *, 200> Worklist;
+ auto AddPredsToWorklist = [&](BasicBlock *BB) {
+ // We use a SetVector so that we don't have duplicates in the worklist.
+ Worklist.insert(pred_begin(BB), pred_end(BB));
+ };
+ // Pops the most recently added block (the worklist is used as a stack).
+ auto NextItem = [&]() {
+ BasicBlock *BB = Worklist.back();
+ Worklist.pop_back();
+ return BB;
+ };
+
+ // Seed the liveness for each individual block
+ for (BasicBlock &BB : F) {
+ Data.KillSet[&BB] = computeKillSet(&BB);
+ Data.LiveSet[&BB].clear();
+ computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+ for (Value *Kill : Data.KillSet[&BB])
+ assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+ // Initial LiveIn = (LiveSet + LiveOut seed) - KillSet.
+ Data.LiveOut[&BB] = DenseSet<Value *>();
+ computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+ Data.LiveIn[&BB] = Data.LiveSet[&BB];
+ set_union(Data.LiveIn[&BB], Data.LiveOut[&BB]);
+ set_subtract(Data.LiveIn[&BB], Data.KillSet[&BB]);
+ if (!Data.LiveIn[&BB].empty())
+ AddPredsToWorklist(&BB);
+ }
+
+ // Propagate that liveness until stable
+ while (!Worklist.empty()) {
+ BasicBlock *BB = NextItem();
+
+ // Compute our new liveout set, then exit early if it hasn't changed
+ // despite the contribution of our successor.
+ DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+ const auto OldLiveOutSize = LiveOut.size();
+ for (BasicBlock *Succ : successors(BB)) {
+ assert(Data.LiveIn.count(Succ));
+ set_union(LiveOut, Data.LiveIn[Succ]);
+ }
+ // assert OldLiveOut is a subset of LiveOut
+ if (OldLiveOutSize == LiveOut.size()) {
+ // If the sets are the same size, then we didn't actually add anything
+ // when unioning our successors' LiveIn. Thus, the LiveIn of this block
+ // hasn't changed.
+ continue;
+ }
+ Data.LiveOut[BB] = LiveOut;
+
+ // Apply the effects of this basic block
+ DenseSet<Value *> LiveTmp = LiveOut;
+ set_union(LiveTmp, Data.LiveSet[BB]);
+ set_subtract(LiveTmp, Data.KillSet[BB]);
+
+ assert(Data.LiveIn.count(BB));
+ const DenseSet<Value *> &OldLiveIn = Data.LiveIn[BB];
+ // assert: OldLiveIn is a subset of LiveTmp
+ if (OldLiveIn.size() != LiveTmp.size()) {
+ Data.LiveIn[BB] = LiveTmp;
+ AddPredsToWorklist(BB);
+ }
+ } // while( !worklist.empty() )
+
+#ifndef NDEBUG
+ // Sanity check our output against SSA properties. This helps catch any
+ // missing kills during the above iteration.
+ for (BasicBlock &BB : F) {
+ checkBasicSSA(DT, Data, BB);
+ }
+#endif
+}
+
+/// Computes the set of gc pointers live across Inst and adds them to Out.
+/// Starts from the recorded LiveOut of Inst's block and walks backwards over
+/// the instructions that follow Inst.
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+                              StatepointLiveSetTy &Out) {
+  BasicBlock *Parent = Inst->getParent();
+  assert(Data.LiveOut.count(Parent));
+
+  // Deliberately a copy: the walk below mutates the set and must not disturb
+  // the per-block result stored in Data.
+  DenseSet<Value *> LiveAtInst = Data.LiveOut[Parent];
+
+  // The statepoint itself is handled specially. Its call result is not live
+  // (as usual), and neither are its arguments unless they are used again
+  // later. The walk therefore stops just short of Inst, which yields exactly
+  // the set that needs relocating.
+  computeLiveInValues(Parent->rbegin(),
+                      BasicBlock::reverse_iterator(Inst->getIterator()),
+                      LiveAtInst);
+  LiveAtInst.erase(Inst);
+
+  Out.insert(LiveAtInst.begin(), LiveAtInst.end());
+}
+
+/// Refreshes Info for call site CS from the revised liveness data: the live
+/// set is replaced by the newly computed one, and PointerToBase is brought
+/// in line with it (self-mapped entries are added for newly live bases and
+/// entries for values that are no longer live are dropped).
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+                                  const CallSite &CS,
+                                  PartiallyConstructedSafepointRecord &Info) {
+  StatepointLiveSetTy Updated;
+  findLiveSetAtInst(CS.getInstruction(), RevisedLivenessData, Updated);
+
+#ifndef NDEBUG
+  // Every value currently recorded as the base of some pointer; only needed
+  // by the assertion in the loop below.
+  DenseSet<Value *> Bases;
+  for (const auto &Entry : Info.PointerToBase)
+    Bases.insert(Entry.second);
+#endif
+
+  // We may have base pointers which are now live that weren't before. Such a
+  // value has no mapping yet; it is recorded as its own base.
+  for (auto LiveV : Updated) {
+    if (Info.PointerToBase.count(LiveV))
+      continue;
+    assert(Bases.count(LiveV) && "can't find base for unexpected live value");
+    Info.PointerToBase[LiveV] = LiveV;
+  }
+
+#ifndef NDEBUG
+  for (auto LiveV : Updated)
+    assert(Info.PointerToBase.count(LiveV) &&
+           "must be able to find base for live value");
+#endif
+
+  // Remove any stale base mappings - this can happen since our liveness is
+  // more precise than the one inherent in the base pointer analysis. Keys are
+  // collected first so the map is not modified while it is being iterated.
+  DenseSet<Value *> Stale;
+  for (const auto &Entry : Info.PointerToBase)
+    if (!Updated.count(Entry.first))
+      Stale.insert(Entry.first);
+  for (auto DeadV : Stale)
+    Info.PointerToBase.erase(DeadV);
+
+#ifndef NDEBUG
+  for (const auto &Entry : Info.PointerToBase)
+    assert(Updated.count(Entry.first) && "record for non-live value");
+#endif
+
+  Info.LiveSet = Updated;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 0000000..8569e08
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1980 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+// * Assumes values are constant unless proven otherwise
+// * Assumes BasicBlocks are dead unless proven otherwise
+// * Proves values to be constant, and replaces them with constants
+// * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "sccp"
+
+// Statistics whose descriptions do not mention IPSCCP.
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+// Statistics whose descriptions attribute them to IPSCCP.
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy. It is a simple class with value semantics.
+///
+/// The state and the associated Constant* are packed into one PointerIntPair.
+/// The mutators below only ever move a value from undefined to constant or
+/// forcedconstant, and from any state to overdefined; nothing moves a value
+/// out of overdefined.
+///
+class LatticeVal {
+ enum LatticeValueTy {
+ /// undefined - This LLVM Value has no known value yet.
+ undefined,
+
+ /// constant - This LLVM Value has a specific constant value.
+ constant,
+
+ /// forcedconstant - This LLVM Value was thought to be undef until
+ /// ResolvedUndefsIn. This is treated just like 'constant', but if merged
+ /// with another (different) constant, it goes to overdefined, instead of
+ /// asserting.
+ forcedconstant,
+
+ /// overdefined - This instruction is not known to be constant, and we know
+ /// it has a value.
+ overdefined
+ };
+
+ /// Val: This stores the current lattice value along with the Constant* for
+ /// the constant if this is a 'constant' or 'forcedconstant' value.
+ PointerIntPair<Constant *, 2, LatticeValueTy> Val;
+
+ /// Returns the raw lattice state stored in the integer part of Val.
+ LatticeValueTy getLatticeValue() const {
+ return Val.getInt();
+ }
+
+public:
+ /// A fresh lattice value starts out 'undefined' with no constant attached.
+ LatticeVal() : Val(nullptr, undefined) {}
+
+ bool isUndefined() const { return getLatticeValue() == undefined; }
+ /// True for both 'constant' and 'forcedconstant'.
+ bool isConstant() const {
+ return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
+ }
+ bool isOverdefined() const { return getLatticeValue() == overdefined; }
+
+ /// Returns the stored constant; only valid when isConstant() holds.
+ Constant *getConstant() const {
+ assert(isConstant() && "Cannot get the constant of a non-constant!");
+ return Val.getPointer();
+ }
+
+ /// markOverdefined - Return true if this is a change in status.
+ bool markOverdefined() {
+ if (isOverdefined())
+ return false;
+
+ Val.setInt(overdefined);
+ return true;
+ }
+
+ /// markConstant - Return true if this is a change in status.
+ ///
+ /// From 'undefined' the value becomes 'constant' V. A value that is already
+ /// 'constant' must be marked with the same V again (asserted). A
+ /// 'forcedconstant' value stays as it is for the same V and drops to
+ /// 'overdefined' for a different one. Calling this on an 'overdefined'
+ /// value trips the assertion below.
+ bool markConstant(Constant *V) {
+ if (getLatticeValue() == constant) { // Constant but not forcedconstant.
+ assert(getConstant() == V && "Marking constant with different value");
+ return false;
+ }
+
+ if (isUndefined()) {
+ Val.setInt(constant);
+ assert(V && "Marking constant with NULL");
+ Val.setPointer(V);
+ } else {
+ assert(getLatticeValue() == forcedconstant &&
+ "Cannot move from overdefined to constant!");
+ // Stay at forcedconstant if the constant is the same.
+ if (V == getConstant()) return false;
+
+ // Otherwise, we go to overdefined. Assumptions made based on the
+ // forced value are possibly wrong. Assuming this is another constant
+ // could expose a contradiction.
+ Val.setInt(overdefined);
+ }
+ return true;
+ }
+
+ /// getConstantInt - If this is a constant with a ConstantInt value, return it
+ /// otherwise return null.
+ ConstantInt *getConstantInt() const {
+ if (isConstant())
+ return dyn_cast<ConstantInt>(getConstant());
+ return nullptr;
+ }
+
+ /// Moves an 'undefined' value to 'forcedconstant' V. Asserts that the value
+ /// is still undefined.
+ void markForcedConstant(Constant *V) {
+ assert(isUndefined() && "Can't force a defined value!");
+ Val.setInt(forcedconstant);
+ Val.setPointer(V);
+ }
+};
+} // end anonymous namespace.
+
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+ const DataLayout &DL;
+ const TargetLibraryInfo *TLI;
+ SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
+ DenseMap<Value*, LatticeVal> ValueState; // The state each value is in.
+
+ /// StructValueState - This maintains ValueState for values that have
+ /// StructType, for example for formal arguments, calls, insertelement, etc.
+ ///
+ DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState;
+
+ /// TrackedGlobals - If we are tracking any values for the contents of a
+ /// global variable, we keep a mapping from the constant accessor to the
+ /// element of the global, to the currently known value. If the value becomes
+ /// overdefined, its entry is simply removed from this map.
+ DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+ /// TrackedRetVals - If we are tracking arguments into and the return
+ /// value out of a function, it will have an entry in this map, indicating
+ /// what the known return value for the function is.
+ DenseMap<Function*, LatticeVal> TrackedRetVals;
+
+ /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+ /// that return multiple values.
+ DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+ /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+ /// represented here for efficient lookup.
+ SmallPtrSet<Function*, 16> MRVFunctionsTracked;
+
+ /// TrackingIncomingArguments - This is the set of functions for whose
+ /// arguments we make optimistic assumptions about and try to prove as
+ /// constants.
+ SmallPtrSet<Function*, 16> TrackingIncomingArguments;
+
+ /// The reason for two worklists is that overdefined is the lowest state
+ /// on the lattice, and moving things to overdefined as fast as possible
+ /// makes SCCP converge much faster.
+ ///
+ /// By having a separate worklist, we accomplish this because everything
+ /// possibly overdefined will become overdefined at the soonest possible
+ /// point.
+ SmallVector<Value*, 64> OverdefinedInstWorkList;
+ SmallVector<Value*, 64> InstWorkList;
+
+
+ SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list
+
+ /// KnownFeasibleEdges - Entries in this set are edges which have already had
+ /// PHI nodes retriggered.
+ typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+ DenseSet<Edge> KnownFeasibleEdges;
+public:
+ /// Construct a solver that remembers the given data layout and target
+ /// library info; no other state is touched here.
+ SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+     : DL(DL),
+       TLI(tli) {
+ }
+
+ /// MarkBlockExecutable - Clients call this to record a block that is known
+ /// to be live in the unit being processed. A block that was not yet in the
+ /// executable set is also queued on the basic block work list.
+ ///
+ /// Returns true only if the block was not considered live before.
+ bool MarkBlockExecutable(BasicBlock *BB) {
+   const bool NewlyLive = BBExecutable.insert(BB).second;
+   if (NewlyLive) {
+     DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+     BBWorkList.push_back(BB); // Schedule the block for processing.
+   }
+   return NewlyLive;
+ }
+
+ /// TrackValueOfGlobalVariable - Ask the solver to follow loads from and
+ /// stores to the given global variable where it is able to. Calling this is
+ /// only legal when performing Interprocedural SCCP.
+ void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+   // Only globals holding a single scalar value are tracked.
+   if (!GV->getType()->getElementType()->isSingleValueType())
+     return;
+
+   LatticeVal &State = TrackedGlobals[GV];
+   Constant *Init = GV->getInitializer();
+   // An undef initializer leaves the freshly created entry as it is.
+   if (!isa<UndefValue>(Init))
+     State.markConstant(Init);
+ }
+
+ /// AddTrackedFunction - This must be called for every function (which cannot
+ /// have its address taken) whose calls and returns the solver is supposed to
+ /// track. It seeds a default lattice value for the return value, or for each
+ /// element of a struct return.
+ void AddTrackedFunction(Function *F) {
+   StructType *RetStruct = dyn_cast<StructType>(F->getReturnType());
+   if (!RetStruct) {
+     // Single (or void) return value: one entry keyed by the function.
+     TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+     return;
+   }
+
+   // Struct return: one entry per element, keyed by (function, index).
+   MRVFunctionsTracked.insert(F);
+   const unsigned NumElts = RetStruct->getNumElements();
+   for (unsigned Elt = 0; Elt != NumElts; ++Elt)
+     TrackedMultipleRetVals.insert(
+         std::make_pair(std::make_pair(F, Elt), LatticeVal()));
+ }
+
+ /// AddArgumentTrackedFunction - Add F to the set of functions whose incoming
+ /// arguments the solver tracks.
+ void AddArgumentTrackedFunction(Function *F) {
+   TrackingIncomingArguments.insert(F);
+ }
+
+ /// Solve - Solve for constants and executable blocks.
+ ///
+ void Solve();
+
+ /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+ /// that branches on undef values cannot reach any of their successors.
+ /// However, this is not a safe assumption. After we solve dataflow, this
+ /// method should be used to handle this. If this returns true, the solver
+ /// should be rerun.
+ bool ResolvedUndefsIn(Function &F);
+
+ /// isBlockExecutable - Return true if BB is in the set of executable blocks.
+ bool isBlockExecutable(BasicBlock *BB) const {
+   return BBExecutable.count(BB) != 0;
+ }
+
+ /// getLatticeValueFor - Return a copy of the lattice value recorded for V.
+ /// V must already have an entry in ValueState; this is asserted.
+ LatticeVal getLatticeValueFor(Value *V) const {
+   auto Entry = ValueState.find(V);
+   assert(Entry != ValueState.end() && "V is not in valuemap!");
+   return Entry->second;
+ }
+
+ /// getTrackedRetVals - Get the inferred return value map. This is a
+ /// read-only accessor, so it is const like isBlockExecutable and
+ /// getLatticeValueFor.
+ const DenseMap<Function*, LatticeVal> &getTrackedRetVals() const {
+   return TrackedRetVals;
+ }
+
+ /// getTrackedGlobals - Get and return the set of inferred initializers for
+ /// global variables. This is a read-only accessor, so it is const.
+ const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() const {
+   return TrackedGlobals;
+ }
+
+ /// markOverdefined - Drive the lattice value of the non-struct value V to
+ /// overdefined. Struct-typed values must use markAnythingOverdefined.
+ void markOverdefined(Value *V) {
+   assert(!V->getType()->isStructTy() && "Should use other method");
+   LatticeVal &State = ValueState[V];
+   markOverdefined(State, V);
+ }
+
+ /// markAnythingOverdefined - Mark the given value overdefined, whether it is
+ /// a scalar or a struct. For a struct, every element is marked separately.
+ void markAnythingOverdefined(Value *V) {
+   StructType *STy = dyn_cast<StructType>(V->getType());
+   if (!STy) {
+     markOverdefined(V);
+     return;
+   }
+
+   for (unsigned Elt = 0, NumElts = STy->getNumElements(); Elt != NumElts;
+        ++Elt)
+     markOverdefined(getStructValueState(V, Elt), V);
+ }
+
+private:
+ // markConstant - Try to move IV to the constant C. If the lattice value
+ // reports a change, V is queued so that its users are revisited later: on
+ // the overdefined work list if IV ended up overdefined, otherwise on the
+ // regular instruction work list.
+ //
+ void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+   const bool Changed = IV.markConstant(C);
+   if (!Changed)
+     return;
+
+   DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+   auto &WorkList =
+       IV.isOverdefined() ? OverdefinedInstWorkList : InstWorkList;
+   WorkList.push_back(V);
+ }
+
+ // markConstant - Convenience overload for non-struct values: looks up (or
+ // creates) the lattice entry of V and forwards to the overload above.
+ void markConstant(Value *V, Constant *C) {
+   assert(!V->getType()->isStructTy() && "Should use other method");
+   LatticeVal &State = ValueState[V];
+   markConstant(State, V, C);
+ }
+
+ // markForcedConstant - Force the non-struct value V to the constant C and
+ // unconditionally queue V on the work list that matches its resulting state.
+ void markForcedConstant(Value *V, Constant *C) {
+   assert(!V->getType()->isStructTy() && "Should use other method");
+   LatticeVal &State = ValueState[V];
+   State.markForcedConstant(C);
+   DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+
+   auto &WorkList =
+       State.isOverdefined() ? OverdefinedInstWorkList : InstWorkList;
+   WorkList.push_back(V);
+ }
+
+
+ // markOverdefined - Move IV to "overdefined". If that is a change, V is
+ // pushed onto the overdefined work list so that its users are updated later.
+ // V is not necessarily an instruction: the debug output below handles a
+ // Function specially.
+ void markOverdefined(LatticeVal &IV, Value *V) {
+   if (IV.markOverdefined()) {
+     DEBUG(dbgs() << "markOverdefined: ";
+           if (Function *F = dyn_cast<Function>(V))
+             dbgs() << "Function '" << F->getName() << "'\n";
+           else
+             dbgs() << *V << '\n');
+     OverdefinedInstWorkList.push_back(V);
+   }
+ }
+
+ // mergeInValue - Merge the lattice value MergeWithV into IV, which belongs
+ // to V. Conflicting constants or an overdefined input drive IV to
+ // overdefined; an undefined input or an already overdefined IV is a no-op.
+ void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+   // Nothing can change in these two cases.
+   if (IV.isOverdefined() || MergeWithV.isUndefined())
+     return;
+
+   if (MergeWithV.isOverdefined()) {
+     markOverdefined(IV, V);
+     return;
+   }
+
+   // The incoming value is a constant from here on.
+   if (IV.isUndefined()) {
+     markConstant(IV, V, MergeWithV.getConstant());
+     return;
+   }
+
+   // Both sides are constants; they must agree.
+   if (IV.getConstant() != MergeWithV.getConstant())
+     markOverdefined(IV, V);
+ }
+
+ // mergeInValue - Convenience overload for non-struct values: merges
+ // MergeWithV into the lattice entry of V.
+ void mergeInValue(Value *V, LatticeVal MergeWithV) {
+   assert(!V->getType()->isStructTy() && "Should use other method");
+   LatticeVal &State = ValueState[V];
+   mergeInValue(State, V, MergeWithV);
+ }
+
+
+ /// getValueState - Return the LatticeVal entry for the non-struct value V,
+ /// creating it on first use. A newly created entry for a Constant other than
+ /// undef is seeded with that constant; every other new entry keeps its
+ /// default state.
+ LatticeVal &getValueState(Value *V) {
+   assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+   auto Inserted = ValueState.insert(std::make_pair(V, LatticeVal()));
+   LatticeVal &LV = Inserted.first->second;
+   const bool IsNewEntry = Inserted.second;
+
+   // Only a brand new entry needs seeding; the common case is a hit.
+   if (IsNewEntry)
+     if (Constant *C = dyn_cast<Constant>(V))
+       if (!isa<UndefValue>(V)) // Undef values remain undefined.
+         LV.markConstant(C);
+
+   return LV;
+ }
+
+ /// getStructValueState - Return the LatticeVal entry for element i of the
+ /// struct-typed value V, creating it on first use. A new entry for a
+ /// Constant is seeded from the matching aggregate element: overdefined if
+ /// there is no such element, left alone if the element is undef, and
+ /// constant otherwise.
+ LatticeVal &getStructValueState(Value *V, unsigned i) {
+   assert(V->getType()->isStructTy() && "Should use getValueState");
+   assert(i < cast<StructType>(V->getType())->getNumElements() &&
+          "Invalid element #");
+
+   auto Inserted = StructValueState.insert(
+       std::make_pair(std::make_pair(V, i), LatticeVal()));
+   LatticeVal &LV = Inserted.first->second;
+
+   // Common case: the entry already existed.
+   if (!Inserted.second)
+     return LV;
+
+   // Non-constants keep the default state of a new entry.
+   Constant *C = dyn_cast<Constant>(V);
+   if (!C)
+     return LV;
+
+   Constant *Elt = C->getAggregateElement(i);
+   if (!Elt)
+     LV.markOverdefined(); // Unknown sort of constant.
+   else if (!isa<UndefValue>(Elt))
+     LV.markConstant(Elt); // Constants are constant.
+   // Otherwise the element is undef and stays undefined.
+
+   return LV;
+ }
+
+
+ /// markEdgeExecutable - Record the CFG edge Source -> Dest as feasible. If
+ /// Dest was not executable yet it is marked so (which queues it on the BB
+ /// work list); otherwise the PHI nodes at the top of Dest are revisited,
+ /// since the new edge may give them new operands.
+ void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+   // An edge that is already known needs no further work.
+   if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+     return;
+
+   // A newly executable destination has just been queued as a whole block.
+   if (MarkBlockExecutable(Dest))
+     return;
+
+   // The destination was live already, so only the *edge* is new.
+   DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+         << " -> " << Dest->getName() << '\n');
+
+   BasicBlock::iterator It = Dest->begin();
+   while (PHINode *PN = dyn_cast<PHINode>(It)) {
+     visitPHINode(*PN);
+     ++It;
+   }
+ }
+
+ // getFeasibleSuccessors - Fill Succs with one boolean per successor of TI,
+ // set to true for each successor that is currently considered reachable.
+ //
+ void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
+
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
+ //
+ bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+ // OperandChangedState - Called for each user of an instruction whose state
+ // has just changed. The user is revisited only when it lives in a block that
+ // is currently executable.
+ //
+ void OperandChangedState(Instruction *I) {
+   const bool InLiveBlock = BBExecutable.count(I->getParent());
+   if (!InLiveBlock)
+     return;
+   visit(*I);
+ }
+
+private:
+ friend class InstVisitor<SCCPSolver>;
+
+ // visit implementations - Something changed in this instruction. Either an
+ // operand made a transition, or the instruction is newly executable. Change
+ // the lattice value of I to reflect these changes if appropriate.
+ void visitPHINode(PHINode &I);
+
+ // Terminators
+ void visitReturnInst(ReturnInst &I);
+ void visitTerminatorInst(TerminatorInst &TI);
+
+ void visitCastInst(CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitCmpInst(CmpInst &I);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+ void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+ void visitFuncletPadInst(FuncletPadInst &FPI) {
+ markAnythingOverdefined(&FPI);
+ }
+ void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+ markAnythingOverdefined(&CPI);
+ visitTerminatorInst(CPI); // Also mark its feasible successors executable.
+ }
+
+ // Instructions that cannot be folded away.
+ void visitStoreInst (StoreInst &I);
+ void visitLoadInst (LoadInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+ void visitCallInst (CallInst &I) {
+ visitCallSite(&I); // Calls and invokes share visitCallSite.
+ }
+ void visitInvokeInst (InvokeInst &II) {
+ visitCallSite(&II);
+ visitTerminatorInst(II); // Also mark its feasible successors executable.
+ }
+ void visitCallSite (CallSite CS);
+ void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
+ void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+ void visitFenceInst (FenceInst &I) { /*returns void*/ }
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ markAnythingOverdefined(&I); // Handles scalar and struct results alike.
+ }
+ void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
+ void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
+ void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
+
+ /// visitInstruction - Fallback visitor for any instruction kind without a
+ /// dedicated handler. The result is conservatively marked overdefined. The
+ /// diagnostic goes through DEBUG() like every other trace message in this
+ /// class, so it is only emitted when debug output is enabled.
+ void visitInstruction(Instruction &I) {
+   // If a new instruction is added to LLVM that we don't handle.
+   DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+   markAnythingOverdefined(&I); // Just in case
+ }
+};
+
+} // end anonymous namespace
+
+
+// getFeasibleSuccessors - Size Succs to the number of successors of TI and set
+// the entries of those successors that are currently reachable from it.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+                                       SmallVectorImpl<bool> &Succs) {
+  const unsigned NumSuccs = TI.getNumSuccessors();
+  Succs.resize(NumSuccs);
+
+  if (auto *Br = dyn_cast<BranchInst>(&TI)) {
+    if (Br->isUnconditional()) {
+      Succs[0] = true;
+      return;
+    }
+
+    LatticeVal CondState = getValueState(Br->getCondition());
+    if (ConstantInt *CondC = CondState.getConstantInt()) {
+      // A constant condition selects exactly one successor: index 1 when the
+      // condition is zero, index 0 otherwise.
+      Succs[CondC->isZero() ? 1 : 0] = true;
+      return;
+    }
+
+    // Overdefined conditions, and constant conditions that cannot be folded
+    // to an integer, mean the branch could go either way. An undefined
+    // condition makes nothing feasible yet.
+    if (!CondState.isUndefined()) {
+      Succs[0] = true;
+      Succs[1] = true;
+    }
+    return;
+  }
+
+  // Successors of unwinding instructions are always executable.
+  if (TI.isExceptional()) {
+    Succs.assign(NumSuccs, true);
+    return;
+  }
+
+  if (auto *Sw = dyn_cast<SwitchInst>(&TI)) {
+    if (!Sw->getNumCases()) {
+      Succs[0] = true;
+      return;
+    }
+
+    LatticeVal CondState = getValueState(Sw->getCondition());
+    if (ConstantInt *CondC = CondState.getConstantInt()) {
+      Succs[Sw->findCaseValue(CondC).getSuccessorIndex()] = true;
+      return;
+    }
+
+    // No integer constant: every destination is executable unless the
+    // condition is still undefined.
+    if (!CondState.isUndefined())
+      Succs.assign(NumSuccs, true);
+    return;
+  }
+
+  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
+  if (isa<IndirectBrInst>(&TI)) {
+    // Every destination is treated as executable.
+    Succs.assign(NumSuccs, true);
+    return;
+  }
+
+#ifndef NDEBUG
+  dbgs() << "Unknown terminator instruction: " << TI << '\n';
+#endif
+  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+
+// isEdgeFeasible - Decide whether control can currently flow along the edge
+// From -> To. To must already be executable; this is asserted.
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+  assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+  // An edge leaving a dead block is never feasible.
+  if (!BBExecutable.count(From))
+    return false;
+
+  // Check whether the terminator of From can actually take this edge.
+  TerminatorInst *Term = From->getTerminator();
+  if (auto *Br = dyn_cast<BranchInst>(Term)) {
+    if (Br->isUnconditional())
+      return true;
+
+    LatticeVal CondState = getValueState(Br->getCondition());
+
+    // A constant condition means the branch can only go a single way.
+    if (ConstantInt *CondC = CondState.getConstantInt())
+      return Br->getSuccessor(CondC->isZero()) == To;
+
+    // Overdefined means the branch could go either way; an undefined
+    // condition means neither edge is feasible yet.
+    return !CondState.isUndefined();
+  }
+
+  // Successors of unwinding instructions are always executable.
+  if (Term->isExceptional())
+    return true;
+
+  if (auto *Sw = dyn_cast<SwitchInst>(Term)) {
+    if (Sw->getNumCases() < 1)
+      return true;
+
+    LatticeVal CondState = getValueState(Sw->getCondition());
+    if (ConstantInt *CondC = CondState.getConstantInt())
+      return Sw->findCaseValue(CondC).getCaseSuccessor() == To;
+
+    return !CondState.isUndefined();
+  }
+
+  // Every destination of an indirect branch is treated as executable.
+  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
+  if (isa<IndirectBrInst>(Term))
+    return true;
+
+#ifndef NDEBUG
+  dbgs() << "Unknown terminator instruction: " << *Term << '\n';
+#endif
+  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+// visit Implementations - Something changed in this instruction, either an
+// operand made a transition, or the instruction is newly executable. Change
+// the value type of I to reflect these changes if appropriate. This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting value coming
+// from different branches, or if the PHI node merges in an overdefined
+// value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+// PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+// destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+// successors executable.
+//
+void SCCPSolver::visitPHINode(PHINode &PN) {
+  // Struct-typed PHIs are not tracked per element; give up on them.
+  // TODO: We could do a lot better than this if code actually uses this.
+  if (PN.getType()->isStructTy())
+    return markAnythingOverdefined(&PN);
+
+  // Nothing can change once the PHI is overdefined.
+  if (getValueState(&PN).isOverdefined())
+    return;
+
+  // PHI nodes with a very large number of inputs are unlikely to ever become
+  // constant and are slow to process, so they are marked overdefined.
+  if (PN.getNumIncomingValues() > 64)
+    return markOverdefined(&PN);
+
+  // Walk the incoming values that arrive over feasible edges. Any overdefined
+  // input, or two inputs with different constants, makes the PHI
+  // overdefined. If all of them carry the same constant the PHI takes that
+  // constant, and with no defined feasible input it stays undefined.
+  Constant *Agreed = nullptr;
+  BasicBlock *PhiBB = PN.getParent();
+  const unsigned NumIncoming = PN.getNumIncomingValues();
+  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+    LatticeVal InState = getValueState(PN.getIncomingValue(Idx));
+    if (InState.isUndefined())
+      continue; // Doesn't influence the PHI node.
+
+    if (!isEdgeFeasible(PN.getIncomingBlock(Idx), PhiBB))
+      continue;
+
+    if (InState.isOverdefined())
+      return markOverdefined(&PN);
+
+    Constant *InC = InState.getConstant();
+    if (!Agreed) {
+      Agreed = InC; // First reachable constant.
+      continue;
+    }
+
+    // A second reachable constant must match the first one.
+    if (InC != Agreed)
+      return markOverdefined(&PN);
+  }
+
+  // All reachable inputs agreed on one constant, or there were none at all,
+  // in which case the PHI is left undefined.
+  if (Agreed)
+    markConstant(&PN, Agreed);
+}
+
+// visitReturnInst - Merge the returned value into the tracked return state of
+// the enclosing function, if that function is being tracked.
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+  if (I.getNumOperands() == 0)
+    return; // ret void
+
+  Function *F = I.getParent()->getParent();
+  Value *RetVal = I.getOperand(0);
+  StructType *RetStructTy = dyn_cast<StructType>(RetVal->getType());
+
+  // Single return value of a tracked function: merge it in.
+  if (!RetStructTy && !TrackedRetVals.empty()) {
+    auto Tracked = TrackedRetVals.find(F);
+    if (Tracked != TrackedRetVals.end()) {
+      mergeInValue(Tracked->second, F, getValueState(RetVal));
+      return;
+    }
+  }
+
+  // Functions that return multiple values: merge element by element.
+  if (TrackedMultipleRetVals.empty() || !RetStructTy ||
+      !MRVFunctionsTracked.count(F))
+    return;
+
+  for (unsigned Elt = 0, NumElts = RetStructTy->getNumElements();
+       Elt != NumElts; ++Elt)
+    mergeInValue(TrackedMultipleRetVals[std::make_pair(F, Elt)], F,
+                 getStructValueState(RetVal, Elt));
+}
+
+// visitTerminatorInst - Mark every edge to a currently feasible successor of
+// TI as executable.
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+  SmallVector<bool, 16> Feasible;
+  getFeasibleSuccessors(TI, Feasible);
+
+  BasicBlock *From = TI.getParent();
+  for (unsigned Idx = 0, NumSuccs = Feasible.size(); Idx != NumSuccs; ++Idx) {
+    if (!Feasible[Idx])
+      continue;
+    markEdgeExecutable(From, TI.getSuccessor(Idx));
+  }
+}
+
+// visitCastInst - A cast of an overdefined operand is overdefined; a cast of
+// a constant operand is folded, unless the fold yields undef.
+void SCCPSolver::visitCastInst(CastInst &I) {
+  LatticeVal OpState = getValueState(I.getOperand(0));
+
+  if (OpState.isOverdefined()) { // Inherit overdefinedness of operand.
+    markOverdefined(&I);
+    return;
+  }
+
+  // The operand is not resolved yet.
+  if (!OpState.isConstant())
+    return;
+
+  Constant *Folded =
+      ConstantExpr::getCast(I.getOpcode(), OpState.getConstant(), I.getType());
+  // An undef result leaves the cast undefined; anything else is propagated.
+  if (!isa<UndefValue>(Folded))
+    markConstant(&I, Folded);
+}
+
+
+// visitExtractValueInst - Propagate the tracked state of one struct element
+// into the extractvalue result. Anything other than a single-index extract of
+// a non-struct element from a struct is marked overdefined.
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+  // Structs nested in structs are not tracked.
+  if (EVI.getType()->isStructTy())
+    return markAnythingOverdefined(&EVI);
+
+  // Extracting through more than one level is not tracked either.
+  if (EVI.getNumIndices() != 1)
+    return markOverdefined(&EVI);
+
+  Value *Agg = EVI.getAggregateOperand();
+  // A non-struct aggregate must be an array.
+  if (!Agg->getType()->isStructTy())
+    return markOverdefined(&EVI);
+
+  unsigned FieldNo = *EVI.idx_begin();
+  LatticeVal FieldState = getStructValueState(Agg, FieldNo);
+  mergeInValue(getValueState(&EVI), &EVI, FieldState);
+}
+
+// visitInsertValueInst - Compute the per-element state of a struct produced
+// by insertvalue: the inserted element takes the state of the inserted value
+// and all other elements pass through from the aggregate operand.
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+  StructType *STy = dyn_cast<StructType>(IVI.getType());
+  if (!STy)
+    return markOverdefined(&IVI);
+
+  // Inserts with more than one index are not handled; drive all elements to
+  // overdefined.
+  if (IVI.getNumIndices() != 1)
+    return markAnythingOverdefined(&IVI);
+
+  Value *Aggr = IVI.getAggregateOperand();
+  const unsigned InsertIdx = *IVI.idx_begin();
+
+  for (unsigned Elt = 0, NumElts = STy->getNumElements(); Elt != NumElts;
+       ++Elt) {
+    if (Elt == InsertIdx) {
+      Value *NewVal = IVI.getInsertedValueOperand();
+      if (NewVal->getType()->isStructTy()) {
+        // Structs nested in structs are not tracked.
+        markOverdefined(getStructValueState(&IVI, Elt), &IVI);
+      } else {
+        LatticeVal InVal = getValueState(NewVal);
+        mergeInValue(getStructValueState(&IVI, Elt), &IVI, InVal);
+      }
+    } else {
+      // Elements that are not being replaced pass through unchanged.
+      LatticeVal EltVal = getStructValueState(Aggr, Elt);
+      mergeInValue(getStructValueState(&IVI, Elt), &IVI, EltVal);
+    }
+  }
+}
+
+// visitSelectInst - Resolve a select from the state of its condition and, if
+// the condition does not decide it, from the states of its two values.
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+  // Struct-typed selects are not tracked per element.
+  // TODO: We could do a lot better than this if code actually uses this.
+  if (I.getType()->isStructTy())
+    return markAnythingOverdefined(&I);
+
+  LatticeVal CondState = getValueState(I.getCondition());
+  if (CondState.isUndefined())
+    return;
+
+  // A known integer condition picks one of the two operands.
+  if (ConstantInt *CondC = CondState.getConstantInt()) {
+    Value *Chosen = I.getTrueValue();
+    if (CondC->isZero())
+      Chosen = I.getFalseValue();
+    mergeInValue(&I, getValueState(Chosen));
+    return;
+  }
+
+  // The condition is overdefined or a constant we can't evaluate. See if the
+  // true/false values still allow something better than overdefined.
+  LatticeVal TrueState = getValueState(I.getTrueValue());
+  LatticeVal FalseState = getValueState(I.getFalseValue());
+
+  // select ?, C, C -> C.
+  const bool SameConstant = TrueState.isConstant() &&
+                            FalseState.isConstant() &&
+                            TrueState.getConstant() == FalseState.getConstant();
+  if (SameConstant)
+    return markConstant(&I, FalseState.getConstant());
+
+  // select ?, undef, X -> X.
+  if (TrueState.isUndefined())
+    return mergeInValue(&I, FalseState);
+  // select ?, X, undef -> X.
+  if (FalseState.isUndefined())
+    return mergeInValue(&I, TrueState);
+
+  markOverdefined(&I);
+}
+
+// Handle binary operators: fold two constant operands, wait while an operand
+// is unresolved, and otherwise try the AND/OR absorbing-element shortcuts
+// before giving up.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+  LatticeVal LHS = getValueState(I.getOperand(0));
+  LatticeVal RHS = getValueState(I.getOperand(1));
+
+  LatticeVal &Result = ValueState[&I];
+  if (Result.isOverdefined())
+    return;
+
+  if (LHS.isConstant() && RHS.isConstant()) {
+    Constant *Folded =
+        ConstantExpr::get(I.getOpcode(), LHS.getConstant(), RHS.getConstant());
+    // X op Y -> undef leaves the result undefined.
+    if (!isa<UndefValue>(Folded))
+      markConstant(Result, &I, Folded);
+    return;
+  }
+
+  // If something is undef, wait for it to resolve.
+  if (!LHS.isOverdefined() && !RHS.isOverdefined())
+    return;
+
+  // At least one operand is overdefined from here on. For AND and OR the
+  // other operand may still determine the result on its own.
+  const unsigned Opc = I.getOpcode();
+  if (Opc == Instruction::And || Opc == Instruction::Or) {
+    LatticeVal *Known = nullptr;
+    if (!LHS.isOverdefined())
+      Known = &LHS;
+    else if (!RHS.isOverdefined())
+      Known = &RHS;
+
+    if (Known) {
+      if (Known->isUndefined()) {
+        // The undefined operand could annihilate the value: zero for AND,
+        // all ones for OR.
+        Constant *Annihilator = Opc == Instruction::And
+                                    ? Constant::getNullValue(I.getType())
+                                    : Constant::getAllOnesValue(I.getType());
+        markConstant(Result, &I, Annihilator);
+        return;
+      }
+
+      if (Opc == Instruction::And) {
+        // X and 0 = 0
+        if (Known->getConstant()->isNullValue())
+          return markConstant(Result, &I, Known->getConstant());
+      } else if (ConstantInt *CI = Known->getConstantInt()) {
+        // X or -1 = -1
+        if (CI->isAllOnesValue())
+          return markConstant(Result, &I, Known->getConstant());
+      }
+    }
+  }
+
+  markOverdefined(&I);
+}
+
+// Handle comparison instructions: fold two constant operands, wait while an
+// operand is unresolved, and mark the result overdefined otherwise.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+  LatticeVal LHS = getValueState(I.getOperand(0));
+  LatticeVal RHS = getValueState(I.getOperand(1));
+
+  LatticeVal &Result = ValueState[&I];
+  if (Result.isOverdefined())
+    return;
+
+  if (LHS.isConstant() && RHS.isConstant()) {
+    Constant *Folded = ConstantExpr::getCompare(
+        I.getPredicate(), LHS.getConstant(), RHS.getConstant());
+    // A comparison that folds to undef leaves the result undefined.
+    if (!isa<UndefValue>(Folded))
+      markConstant(Result, &I, Folded);
+    return;
+  }
+
+  // If operands are still undefined, wait for them to resolve.
+  const bool AnyOverdefined = LHS.isOverdefined() || RHS.isOverdefined();
+  if (!AnyOverdefined)
+    return;
+
+  markOverdefined(&I);
+}
+
+void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
+ // TODO: SCCP does not handle vectors properly; the result is always overdefined.
+ return markOverdefined(&I);
+
+#if 0 // Compiled out, and unreachable after the return above.
+ LatticeVal &ValState = getValueState(I.getOperand(0));
+ LatticeVal &IdxState = getValueState(I.getOperand(1));
+
+ if (ValState.isOverdefined() || IdxState.isOverdefined())
+ markOverdefined(&I);
+ else if(ValState.isConstant() && IdxState.isConstant())
+ markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
+ IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
+ // TODO: SCCP does not handle vectors properly; the result is always overdefined.
+ return markOverdefined(&I);
+#if 0 // Compiled out, and unreachable after the return above.
+ LatticeVal &ValState = getValueState(I.getOperand(0));
+ LatticeVal &EltState = getValueState(I.getOperand(1));
+ LatticeVal &IdxState = getValueState(I.getOperand(2));
+
+ if (ValState.isOverdefined() || EltState.isOverdefined() ||
+ IdxState.isOverdefined())
+ markOverdefined(&I);
+ else if(ValState.isConstant() && EltState.isConstant() &&
+ IdxState.isConstant())
+ markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
+ EltState.getConstant(),
+ IdxState.getConstant()));
+ else if (ValState.isUndefined() && EltState.isConstant() &&
+ IdxState.isConstant())
+ markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
+ EltState.getConstant(),
+ IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
+ // TODO: SCCP does not handle vectors properly; the result is always overdefined.
+ return markOverdefined(&I);
+#if 0 // Compiled out, and unreachable after the return above.
+ LatticeVal &V1State = getValueState(I.getOperand(0));
+ LatticeVal &V2State = getValueState(I.getOperand(1));
+ LatticeVal &MaskState = getValueState(I.getOperand(2));
+
+ if (MaskState.isUndefined() ||
+ (V1State.isUndefined() && V2State.isUndefined()))
+ return; // Undefined output if mask or both inputs undefined.
+
+ if (V1State.isOverdefined() || V2State.isOverdefined() ||
+ MaskState.isOverdefined()) {
+ markOverdefined(&I);
+ } else {
+ // A mix of constant/undef inputs.
+ Constant *V1 = V1State.isConstant() ?
+ V1State.getConstant() : UndefValue::get(I.getType());
+ Constant *V2 = V2State.isConstant() ?
+ V2State.getConstant() : UndefValue::get(I.getType());
+ Constant *Mask = MaskState.isConstant() ?
+ MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
+ markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
+ }
+#endif
+}
+
+// Handle getelementptr instructions. Once every operand is a known constant
+// the instruction is folded into a getelementptr ConstantExpr. An unresolved
+// operand postpones the decision and an overdefined one makes the result
+// overdefined.
+//
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+  if (ValueState[&I].isOverdefined())
+    return;
+
+  SmallVector<Constant*, 8> ConstOps;
+  ConstOps.reserve(I.getNumOperands());
+
+  for (unsigned OpNo = 0, NumOps = I.getNumOperands(); OpNo != NumOps;
+       ++OpNo) {
+    LatticeVal OpState = getValueState(I.getOperand(OpNo));
+    if (OpState.isUndefined())
+      return; // Operands are not resolved yet.
+
+    if (OpState.isOverdefined())
+      return markOverdefined(&I);
+
+    assert(OpState.isConstant() && "Unknown state!");
+    ConstOps.push_back(OpState.getConstant());
+  }
+
+  // Operand 0 is the base pointer; the remaining operands are the indices.
+  Constant *Base = ConstOps[0];
+  auto Idxs = makeArrayRef(ConstOps.begin() + 1, ConstOps.end());
+  Constant *Folded =
+      ConstantExpr::getGetElementPtr(I.getSourceElementType(), Base, Idxs);
+  // An undef fold leaves the result undefined.
+  if (!isa<UndefValue>(Folded))
+    markConstant(&I, Folded);
+}
+
+// visitStoreInst - Merge the value stored to a tracked global variable into
+// the state known for that global. A global whose state becomes overdefined
+// is dropped from the tracked set.
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+  // Stores of struct values are ignored.
+  if (SI.getOperand(0)->getType()->isStructTy())
+    return;
+
+  if (TrackedGlobals.empty())
+    return;
+
+  // Only stores made directly to a global variable are of interest.
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(SI.getOperand(1));
+  if (!GV)
+    return;
+
+  auto Tracked = TrackedGlobals.find(GV);
+  if (Tracked == TrackedGlobals.end() || Tracked->second.isOverdefined())
+    return;
+
+  // Fold the stored value into what is known about the global.
+  mergeInValue(Tracked->second, GV, getValueState(SI.getOperand(0)));
+  if (Tracked->second.isOverdefined())
+    TrackedGlobals.erase(Tracked); // No need to keep tracking this!
+}
+
+
+// Handle load instructions. If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+  // Struct-typed loads are not tracked per element.
+  if (I.getType()->isStructTy())
+    return markAnythingOverdefined(&I);
+
+  LatticeVal PtrState = getValueState(I.getOperand(0));
+  if (PtrState.isUndefined())
+    return; // The pointer is not resolved yet!
+
+  LatticeVal &Result = ValueState[&I];
+  if (Result.isOverdefined())
+    return;
+
+  // Nothing is known about volatile loads or loads through a non-constant
+  // pointer.
+  if (!PtrState.isConstant() || I.isVolatile())
+    return markOverdefined(Result, &I);
+
+  Constant *Ptr = PtrState.getConstant();
+
+  // load null is undefined.
+  if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
+    return;
+
+  // A load from a tracked global takes the value known for that global.
+  if (!TrackedGlobals.empty()) {
+    if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+      auto Tracked = TrackedGlobals.find(GV);
+      if (Tracked != TrackedGlobals.end()) {
+        mergeInValue(Result, &I, Tracked->second);
+        return;
+      }
+    }
+  }
+
+  // Try to fold a load from a constant into a constant.
+  Constant *Loaded = ConstantFoldLoadFromConstPtr(Ptr, DL);
+  if (!Loaded) {
+    // We cannot say for certain what value this load will produce.
+    return markOverdefined(Result, &I);
+  }
+
+  // A load that folds to undef leaves the result undefined.
+  if (!isa<UndefValue>(Loaded))
+    markConstant(Result, &I, Loaded);
+}
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+ Function *F = CS.getCalledFunction();
+ Instruction *I = CS.getInstruction();
+ // Update the lattice state of a call/invoke: try to fold calls to untracked callees, otherwise merge argument and return-value states with the tracked callee.
+ // The common case is that we aren't tracking the callee, either because we
+ // are not doing interprocedural analysis or the callee is indirect, or is
+ // external. Handle these cases first.
+ if (!F || F->isDeclaration()) {
+CallOverdefined: // Also entered via goto from the tracked-return-value checks at the end of this function.
+ // Void return and not tracking callee, just bail.
+ if (I->getType()->isVoidTy()) return;
+
+ // Otherwise, if the call produces a single (non-struct) value and the callee
+ // is a declaration, maybe we can constant fold it.
+ if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
+ canConstantFoldCallTo(F)) {
+
+ SmallVector<Constant*, 8> Operands;
+ for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+ AI != E; ++AI) {
+ LatticeVal State = getValueState(*AI);
+
+ if (State.isUndefined())
+ return; // Operands are not resolved yet.
+ if (State.isOverdefined())
+ return markOverdefined(I);
+ assert(State.isConstant() && "Unknown state!");
+ Operands.push_back(State.getConstant());
+ }
+ // Nothing to refine if the call result is already overdefined.
+ if (getValueState(I).isOverdefined())
+ return;
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(F, Operands, TLI)) {
+ // A call that folds to undef is left in its current lattice state.
+ if (isa<UndefValue>(C))
+ return;
+ return markConstant(I, C);
+ }
+ }
+
+ // Otherwise, we don't know anything about this call, mark it overdefined.
+ return markAnythingOverdefined(I);
+ }
+
+ // If the callee is one whose incoming arguments we track, mark its
+ // entry block executable and merge in the actual arguments to the call into
+ // the formal arguments of the function.
+ if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
+ MarkBlockExecutable(&F->front());
+
+ // Propagate information from this call site into the callee.
+ CallSite::arg_iterator CAI = CS.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+ AI != E; ++AI, ++CAI) {
+ // If this argument is byval, and if the function is not readonly, there
+ // will be an implicit copy formed of the input aggregate.
+ if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+ markOverdefined(&*AI);
+ continue;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(AI->getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal CallArg = getStructValueState(*CAI, i);
+ mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
+ }
+ } else {
+ mergeInValue(&*AI, getValueState(*CAI));
+ }
+ }
+ }
+
+ // Struct return values are looked up per element in TrackedMultipleRetVals; every other return type is looked up in TrackedRetVals.
+ if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (!MRVFunctionsTracked.count(F))
+ goto CallOverdefined; // Not tracking this callee.
+
+ // If we are tracking this callee, propagate the result of the function
+ // into this call site.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(getStructValueState(I, i), I,
+ TrackedMultipleRetVals[std::make_pair(F, i)]);
+ } else {
+ DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+ if (TFRVI == TrackedRetVals.end())
+ goto CallOverdefined; // Not tracking this callee.
+
+ // If so, propagate the return value of the callee into this call result.
+ mergeInValue(I, TFRVI->second);
+ }
+}
+
+void SCCPSolver::Solve() {
+  // Keep draining until all three work lists are empty at the same time;
+  // servicing one list may push new entries onto the others.
+  for (;;) {
+    if (BBWorkList.empty() && InstWorkList.empty() &&
+        OverdefinedInstWorkList.empty())
+      break;
+
+    // Overdefined values are handled first, which drives other things to
+    // overdefined more quickly.
+    while (!OverdefinedInstWorkList.empty()) {
+      Value *Changed = OverdefinedInstWorkList.pop_back_val();
+
+      DEBUG(dbgs() << "\nPopped off OI-WL: " << *Changed << '\n');
+
+      // The value changed state, so every instruction using it has to be
+      // revisited.
+      for (User *U : Changed->users()) {
+        Instruction *UserInst = dyn_cast<Instruction>(U);
+        if (UserInst)
+          OperandChangedState(UserInst);
+      }
+    }
+
+    // Next, the ordinary instruction work list.
+    while (!InstWorkList.empty()) {
+      Value *Changed = InstWorkList.pop_back_val();
+
+      DEBUG(dbgs() << "\nPopped off I-WL: " << *Changed << '\n');
+
+      // A non-struct value that has since become overdefined need not be
+      // processed here: its users were already notified through the
+      // overdefined work list. Struct-typed values are always propagated.
+      if (!Changed->getType()->isStructTy() &&
+          getValueState(Changed).isOverdefined())
+        continue;
+
+      for (User *U : Changed->users()) {
+        Instruction *UserInst = dyn_cast<Instruction>(U);
+        if (UserInst)
+          OperandChangedState(UserInst);
+      }
+    }
+
+    // Finally, visit every block that has newly become executable.
+    while (!BBWorkList.empty()) {
+      BasicBlock *BB = BBWorkList.back();
+      BBWorkList.pop_back();
+
+      DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+      // Tell all instructions in the block that they are now reachable.
+      visit(BB);
+    }
+  }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption. After we solve dataflow, this
+/// method should be used to handle this. If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one
+/// of the edges from the block as being feasible, even though the condition
+/// doesn't say it would otherwise be. This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible). This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined. For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (!BBExecutable.count(&*BB))
+ continue;
+
+ for (Instruction &I : *BB) {
+ // Look for instructions which produce undef values.
+ if (I.getType()->isVoidTy()) continue;
+
+ if (StructType *STy = dyn_cast<StructType>(I.getType())) {
+ // Only a few things that can be structs matter for undef.
+
+ // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+ if (CallSite CS = CallSite(&I))
+ if (Function *F = CS.getCalledFunction()) // NOTE: this F shadows the parameter F.
+ if (MRVFunctionsTracked.count(F))
+ continue;
+
+ // extractvalue and insertvalue don't need to be marked; they are
+ // tracked as precisely as their operands.
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+ continue;
+
+ // Send the results of everything else to overdefined. We could be
+ // more precise than this but it isn't worth bothering.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal &LV = getStructValueState(&I, i);
+ if (LV.isUndefined())
+ markOverdefined(LV, &I);
+ }
+ continue;
+ }
+
+ LatticeVal &LV = getValueState(&I);
+ if (!LV.isUndefined()) continue;
+
+ // extractvalue is safe; check here because the argument is a struct.
+ if (isa<ExtractValueInst>(I))
+ continue;
+
+ // Compute the operand LatticeVals, for convenience below.
+ // Anything taking a struct is conservatively assumed to require
+ // overdefined markings.
+ if (I.getOperand(0)->getType()->isStructTy()) {
+ markOverdefined(&I);
+ return true; // State changed; per the contract above the solver is rerun.
+ }
+ LatticeVal Op0LV = getValueState(I.getOperand(0));
+ LatticeVal Op1LV; // Only filled in for two-operand instructions (and for Select, below).
+ if (I.getNumOperands() == 2) {
+ if (I.getOperand(1)->getType()->isStructTy()) {
+ markOverdefined(&I);
+ return true;
+ }
+
+ Op1LV = getValueState(I.getOperand(1));
+ }
+ // If this is an instruction whose result is defined even if the input is
+ // not fully defined, propagate the information.
+ Type *ITy = I.getType();
+ switch (I.getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ break; // Any undef -> undef
+ case Instruction::FSub:
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ // Floating-point binary operation: be conservative.
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ else
+ markOverdefined(&I);
+ return true;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ // undef -> 0; some outputs are impossible
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Mul:
+ case Instruction::And:
+ // Both operands undef -> undef
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ break;
+ // undef * X -> 0. X could be zero.
+ // undef & X -> 0. X could be zero.
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+
+ case Instruction::Or:
+ // Both operands undef -> undef
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ break;
+ // undef | X -> -1. X could be -1.
+ markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+ return true;
+
+ case Instruction::Xor:
+ // undef ^ undef -> 0; strictly speaking, this is not
+ // necessary, but we try to be nice to people who expect this
+ // behavior in simple cases
+ if (Op0LV.isUndefined() && Op1LV.isUndefined()) {
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ }
+ // undef ^ X -> undef
+ break;
+
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // X / undef -> undef. No change.
+ // X % undef -> undef. No change.
+ if (Op1LV.isUndefined()) break;
+
+ // X / 0 -> undef. No change.
+ // X % 0 -> undef. No change.
+ if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue())
+ break;
+
+ // undef / X -> 0. X could be maxint.
+ // undef % X -> 0. X could be 1.
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+
+ case Instruction::AShr:
+ // X >>a undef -> undef.
+ if (Op1LV.isUndefined()) break;
+
+ // undef >>a X -> all ones
+ markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+ return true;
+ case Instruction::LShr:
+ case Instruction::Shl:
+ // X << undef -> undef.
+ // X >> undef -> undef.
+ if (Op1LV.isUndefined()) break;
+
+ // undef << X -> 0
+ // undef >> X -> 0
+ markForcedConstant(&I, Constant::getNullValue(ITy));
+ return true;
+ case Instruction::Select:
+ Op1LV = getValueState(I.getOperand(1));
+ // undef ? X : Y -> X or Y. There could be commonality between X/Y.
+ if (Op0LV.isUndefined()) {
+ if (!Op1LV.isConstant()) // Pick the constant one if there is any.
+ Op1LV = getValueState(I.getOperand(2));
+ } else if (Op1LV.isUndefined()) {
+ // c ? undef : undef -> undef. No change.
+ Op1LV = getValueState(I.getOperand(2));
+ if (Op1LV.isUndefined())
+ break;
+ // Otherwise, c ? undef : x -> x.
+ } else {
+ // Leave Op1LV as Operand(1)'s LatticeValue.
+ }
+
+ if (Op1LV.isConstant())
+ markForcedConstant(&I, Op1LV.getConstant());
+ else
+ markOverdefined(&I);
+ return true;
+ case Instruction::Load:
+ // A load here means one of two things: a load of undef from a global,
+ // a load from an unknown pointer. Either way, having it return undef
+ // is okay.
+ break;
+ case Instruction::ICmp:
+ // X == undef -> undef. Other comparisons get more complicated.
+ if (cast<ICmpInst>(&I)->isEquality())
+ break;
+ markOverdefined(&I);
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ // There are two reasons a call can have an undef result:
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (Function *F = CallSite(&I).getCalledFunction())
+ if (TrackedRetVals.count(F))
+ break;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ }
+ default:
+ // If we don't know what should happen here, conservatively mark it
+ // overdefined.
+ markOverdefined(&I);
+ return true;
+ }
+ }
+
+ // Check to see if we have a branch or switch on an undefined value. If so
+ // we force the branch to go one way or the other to make the successor
+ // values live. It doesn't really matter which way we force it.
+ TerminatorInst *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional()) continue;
+ if (!getValueState(BI->getCondition()).isUndefined())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // false.
+ if (isa<UndefValue>(BI->getCondition())) {
+ BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+ markEdgeExecutable(&*BB, TI->getSuccessor(1));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to false.
+ markForcedConstant(BI->getCondition(),
+ ConstantInt::getFalse(TI->getContext()));
+ return true;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (!SI->getNumCases())
+ continue;
+ if (!getValueState(SI->getCondition()).isUndefined())
+ continue;
+
+ // If the input to SCCP is actually switch on undef, fix the undef to
+ // the first constant.
+ if (isa<UndefValue>(SI->getCondition())) {
+ SI->setCondition(SI->case_begin().getCaseValue());
+ markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor());
+ return true;
+ }
+
+ markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// SCCP - Function pass wrapper that drives the SCCPSolver to perform
+  /// Sparse Conditional Constant Propagation on one function at a time.
+  ///
+  struct SCCP : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+
+    SCCP() : FunctionPass(ID) {
+      initializeSCCPPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Run the Sparse Conditional Constant Propagation algorithm on \p F;
+    /// returns true if the function was modified.
+    bool runOnFunction(Function &F) override;
+
+    /// Declares the analyses this pass requires and the ones it preserves.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+  };
+} // end anonymous namespace
+
+char SCCP::ID = 0; // Definition of the static ID member that SCCP's constructor passes to FunctionPass.
+INITIALIZE_PASS(SCCP, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - Public interface to this file: returns a newly allocated function-level SCCP pass.
+FunctionPass *llvm::createSCCPPass() {
+ return new SCCP();
+}
+
+/// Strip a dead block down to its terminator (and any EH pads), replacing
+/// remaining uses of the removed values with undef. Updates the
+/// NumDeadBlocks and NumInstRemoved statistics.
+static void DeleteInstructionInBlock(BasicBlock *BB) {
+  DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+  ++NumDeadBlocks;
+
+  // A block that starts with its terminator has nothing to delete.
+  if (isa<TerminatorInst>(BB->begin()))
+    return;
+
+  // Work backwards from the terminator; deleting in this order tends to
+  // touch fewer def-use and use-def chains. 'Keep' is the earliest
+  // instruction known to survive; everything before it is still a candidate.
+  for (Instruction *Keep = BB->getTerminator(); Keep != BB->begin();) {
+    Instruction *Victim = &*--Keep->getIterator();
+    if (!Victim->use_empty())
+      Victim->replaceAllUsesWith(UndefValue::get(Victim->getType()));
+
+    if (!Victim->isEHPad()) {
+      BB->getInstList().erase(Victim);
+      ++NumInstRemoved;
+      continue;
+    }
+
+    // EH pads are left in place (only their uses were replaced); carry on
+    // with the instructions in front of the pad.
+    Keep = Victim;
+  }
+}
+
+// runOnFunction() - Solve the SCCP dataflow problem for F, then rewrite the
+// function using the results. Returns true if the function was modified.
+//
+bool SCCP::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  SCCPSolver Solver(DL, TLI);
+
+  // Seed the solver: the entry block is reachable and nothing is known about
+  // the incoming arguments.
+  Solver.MarkBlockExecutable(&F.front());
+  for (Argument &Arg : F.args())
+    Solver.markAnythingOverdefined(&Arg);
+
+  // Alternate between solving and resolving undefs until the latter reports
+  // that nothing changed.
+  do {
+    Solver.Solve();
+    DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+  } while (Solver.ResolvedUndefsIn(F));
+
+  bool MadeChanges = false;
+
+  for (BasicBlock &Block : F) {
+    // Blocks proven unreachable only have their contents removed; the block
+    // itself stays because this pass cannot modify the CFG of the function.
+    if (!Solver.isBlockExecutable(&Block)) {
+      DeleteInstructionInBlock(&Block);
+      MadeChanges = true;
+      continue;
+    }
+
+    // Replace every instruction whose value the solver pinned down. The
+    // iterator is advanced before the instruction may be erased.
+    for (BasicBlock::iterator Cur = Block.begin(), End = Block.end();
+         Cur != End;) {
+      Instruction *Inst = &*Cur++;
+      Type *Ty = Inst->getType();
+      if (Ty->isVoidTy() || isa<TerminatorInst>(Inst))
+        continue;
+
+      // TODO: Reconstruct structs from their elements.
+      if (Ty->isStructTy())
+        continue;
+
+      LatticeVal IV = Solver.getLatticeValueFor(Inst);
+      if (IV.isOverdefined())
+        continue;
+
+      // A value that is not a constant at this point is still undefined, so
+      // undef is a valid replacement.
+      Constant *Const;
+      if (IV.isConstant())
+        Const = IV.getConstant();
+      else
+        Const = UndefValue::get(Ty);
+      DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n');
+
+      // Redirect all users to the constant, then drop the instruction.
+      Inst->replaceAllUsesWith(Const);
+      Inst->eraseFromParent();
+
+      MadeChanges = true;
+      ++NumInstRemoved;
+    }
+  }
+
+  return MadeChanges;
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// IPSCCP - Module pass implementing interprocedural Sparse Conditional
+  /// Constant Propagation.
+  ///
+  struct IPSCCP : public ModulePass {
+    static char ID;
+
+    IPSCCP() : ModulePass(ID) {
+      initializeIPSCCPPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// Entry point of the pass; defined out of line.
+    bool runOnModule(Module &M) override;
+
+    /// The pass requires TargetLibraryInfo and declares nothing as preserved.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+  };
+} // end anonymous namespace
+
+char IPSCCP::ID = 0; // Definition of the static ID member that IPSCCP's constructor passes to ModulePass.
+INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCP, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - Public interface to this file: returns a newly allocated interprocedural SCCP module pass.
+ModulePass *llvm::createIPSCCPPass() {
+ return new IPSCCP();
+}
+
+
+/// Returns true if \p GV has any use other than: being the callee of a
+/// call/invoke, being the pointer operand of a non-volatile load or store,
+/// or being referenced by a blockaddress. Dead constant users of \p GV are
+/// removed first.
+static bool AddressIsTaken(const GlobalValue *GV) {
+  // Delete any dead constantexpr klingons.
+  GV->removeDeadConstantUsers();
+
+  for (const Use &U : GV->uses()) {
+    const User *TheUser = U.getUser();
+
+    if (const StoreInst *Store = dyn_cast<StoreInst>(TheUser)) {
+      // Storing the address of GV itself, or any volatile store, counts.
+      if (Store->isVolatile() || Store->getOperand(0) == GV)
+        return true;
+      continue;
+    }
+
+    if (isa<InvokeInst>(TheUser) || isa<CallInst>(TheUser)) {
+      // Being called is fine; being passed as an argument is not.
+      ImmutableCallSite CS(cast<Instruction>(TheUser));
+      if (!CS.isCallee(&U))
+        return true;
+      continue;
+    }
+
+    if (const LoadInst *Load = dyn_cast<LoadInst>(TheUser)) {
+      if (Load->isVolatile())
+        return true;
+      continue;
+    }
+
+    // blockaddress doesn't take the address of the function, it takes the
+    // address of a label. Every other kind of user is treated as taking the
+    // address.
+    if (!isa<BlockAddress>(TheUser))
+      return true;
+  }
+
+  return false;
+}
+
+/// runOnModule - Run interprocedural SCCP over \p M.
+///
+/// Sets up the solver (tracked functions, tracked arguments, tracked
+/// globals), solves to a fixed point, and then rewrites the module:
+/// arguments and instructions with known values are replaced, unreachable
+/// blocks are removed, return values nobody can observe are replaced with
+/// undef, and tracked globals together with the stores to them are deleted.
+///
+/// \returns true if the module was modified in any way. Every mutation
+/// below sets MadeChanges, including argument replacement, return zapping
+/// and global deletion, which previously went unreported.
+bool IPSCCP::runOnModule(Module &M) {
+  const DataLayout &DL = M.getDataLayout();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  SCCPSolver Solver(DL, TLI);
+
+  // AddressTakenFunctions - This set keeps track of the address-taken functions
+  // that are in the input. As IPSCCP runs through and simplifies code,
+  // functions that were address taken can end up losing their
+  // address-taken-ness. Because of this, we keep track of their addresses from
+  // the first pass so we can use them for the later simplification pass.
+  SmallPtrSet<Function*, 32> AddressTakenFunctions;
+
+  // Loop over all functions, marking arguments to those with their addresses
+  // taken or that are external as overdefined.
+  //
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration())
+      continue;
+
+    // If this is a strong or ODR definition of this function, then we can
+    // propagate information about its result into callsites of it.
+    if (!F->mayBeOverridden())
+      Solver.AddTrackedFunction(&*F);
+
+    // If this function only has direct calls that we can see, we can track its
+    // arguments and return value aggressively, and can assume it is not called
+    // unless we see evidence to the contrary.
+    if (F->hasLocalLinkage()) {
+      if (AddressIsTaken(&*F))
+        AddressTakenFunctions.insert(&*F);
+      else {
+        Solver.AddArgumentTrackedFunction(&*F);
+        continue;
+      }
+    }
+
+    // Assume the function is called.
+    Solver.MarkBlockExecutable(&F->front());
+
+    // Assume nothing about the incoming arguments.
+    for (Argument &AI : F->args())
+      Solver.markAnythingOverdefined(&AI);
+  }
+
+  // Loop over global variables. We inform the solver about any internal global
+  // variables that do not have their 'addresses taken'. If they don't have
+  // their addresses taken, we can propagate constants through them.
+  for (GlobalVariable &G : M.globals())
+    if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
+      Solver.TrackValueOfGlobalVariable(&G);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+
+    DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+    ResolvedUndefs = false;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+      ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+  }
+
+  bool MadeChanges = false;
+
+  // Iterate over all of the instructions in the module, replacing them with
+  // constants if we have found them to be of constant values.
+  //
+  SmallVector<BasicBlock*, 512> BlocksToErase;
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration())
+      continue;
+
+    if (Solver.isBlockExecutable(&F->front())) {
+      for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+           AI != E; ++AI) {
+        if (AI->use_empty() || AI->getType()->isStructTy()) continue;
+
+        // TODO: Could use getStructLatticeValueFor to find out if the entire
+        // result is a constant and replace it entirely if so.
+
+        LatticeVal IV = Solver.getLatticeValueFor(&*AI);
+        if (IV.isOverdefined()) continue;
+
+        Constant *CST = IV.isConstant() ?
+          IV.getConstant() : UndefValue::get(AI->getType());
+        DEBUG(dbgs() << "*** Arg " << *AI << " = " << *CST <<"\n");
+
+        // Replaces all of the uses of a variable with uses of the
+        // constant.
+        AI->replaceAllUsesWith(CST);
+        ++IPNumArgsElimed;
+
+        // The argument had at least one use (checked above), so rewriting
+        // its uses changed the IR and must be reported.
+        MadeChanges = true;
+      }
+    }
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      if (!Solver.isBlockExecutable(&*BB)) {
+        DeleteInstructionInBlock(&*BB);
+        MadeChanges = true;
+
+        // Detach the dead block from its successors' PHI nodes and replace
+        // its terminator with 'unreachable'.
+        TerminatorInst *TI = BB->getTerminator();
+        for (BasicBlock *Succ : TI->successors()) {
+          if (!Succ->empty() && isa<PHINode>(Succ->begin()))
+            Succ->removePredecessor(&*BB);
+        }
+        if (!TI->use_empty())
+          TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        TI->eraseFromParent();
+        new UnreachableInst(M.getContext(), &*BB);
+
+        // The entry block cannot be erased; everything else is queued.
+        if (&*BB != &F->front())
+          BlocksToErase.push_back(&*BB);
+        continue;
+      }
+
+      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+        Instruction *Inst = &*BI++;
+        if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())
+          continue;
+
+        // TODO: Could use getStructLatticeValueFor to find out if the entire
+        // result is a constant and replace it entirely if so.
+
+        LatticeVal IV = Solver.getLatticeValueFor(Inst);
+        if (IV.isOverdefined())
+          continue;
+
+        Constant *Const = IV.isConstant()
+          ? IV.getConstant() : UndefValue::get(Inst->getType());
+        DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n');
+
+        // Replaces all of the uses of a variable with uses of the
+        // constant.
+        Inst->replaceAllUsesWith(Const);
+
+        // Delete the instruction. Calls and terminators are kept in place
+        // even though their value is no longer used.
+        if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+          Inst->eraseFromParent();
+
+        // Hey, we just changed something!
+        MadeChanges = true;
+        ++IPNumInstRemoved;
+      }
+    }
+
+    // Now that all instructions in the function are constant folded, erase dead
+    // blocks, because we can now use ConstantFoldTerminator to get rid of
+    // in-edges.
+    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+      // If there are any PHI nodes in this successor, drop entries for BB now.
+      BasicBlock *DeadBB = BlocksToErase[i];
+      for (Value::user_iterator UI = DeadBB->user_begin(),
+                                UE = DeadBB->user_end();
+           UI != UE;) {
+        // Grab the user and then increment the iterator early, as the user
+        // will be deleted. Step past all adjacent uses from the same user.
+        Instruction *I = dyn_cast<Instruction>(*UI);
+        do { ++UI; } while (UI != UE && *UI == I);
+
+        // Ignore blockaddress users; BasicBlock's dtor will handle them.
+        if (!I) continue;
+
+        bool Folded = ConstantFoldTerminator(I->getParent());
+        if (!Folded) {
+          // The constant folder may not have been able to fold the terminator
+          // if this is a branch or switch on undef. Fold it manually as a
+          // branch to the first successor.
+#ifndef NDEBUG
+          if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+                   "Branch should be foldable!");
+          } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+          } else {
+            llvm_unreachable("Didn't fold away reference to block!");
+          }
+#endif
+
+          // Make this an uncond branch to the first successor.
+          TerminatorInst *TI = I->getParent()->getTerminator();
+          BranchInst::Create(TI->getSuccessor(0), TI);
+
+          // Remove entries in successor phi nodes to remove edges.
+          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+            TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+          // Remove the old terminator.
+          TI->eraseFromParent();
+        }
+      }
+
+      // Finally, delete the basic block.
+      F->getBasicBlockList().erase(DeadBB);
+    }
+    BlocksToErase.clear();
+  }
+
+  // If we inferred constant or undef return values for a function, we replaced
+  // all call uses with the inferred value. This means we don't need to bother
+  // actually returning anything from the function. Replace all return
+  // instructions with return undef.
+  //
+  // Do this in two stages: first identify the functions we should process, then
+  // actually zap their returns. This is important because we can only do this
+  // if the address of the function isn't taken. In cases where a return is the
+  // last use of a function, the order of processing functions would affect
+  // whether other functions are optimizable.
+  SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+  // TODO: Process multiple value ret instructions also.
+  const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+  for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
+       E = RV.end(); I != E; ++I) {
+    Function *F = I->first;
+    if (I->second.isOverdefined() || F->getReturnType()->isVoidTy())
+      continue;
+
+    // We can only do this if we know that nothing else can call the function.
+    if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
+      continue;
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+        if (!isa<UndefValue>(RI->getOperand(0)))
+          ReturnsToZap.push_back(RI);
+  }
+
+  // Zap all returns which we've identified as zap to change.
+  for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+    Function *F = ReturnsToZap[i]->getParent()->getParent();
+    ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+
+    // Only returns of non-undef values were queued, so this is a real
+    // change to the IR.
+    MadeChanges = true;
+  }
+
+  // If we inferred constant or undef values for globals variables, we can
+  // delete the global and any stores that remain to it.
+  const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+  for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+       E = TG.end(); I != E; ++I) {
+    GlobalVariable *GV = I->first;
+    assert(!I->second.isOverdefined() &&
+           "Overdefined values should have been taken out of the map!");
+    DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+    while (!GV->use_empty()) {
+      StoreInst *SI = cast<StoreInst>(GV->user_back());
+      SI->eraseFromParent();
+    }
+    M.getGlobalList().erase(GV);
+    ++IPNumGlobalConst;
+
+    // Erasing a global (and the stores to it) modifies the module.
+    MadeChanges = true;
+  }
+
+  return MadeChanges;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 0000000..a7361b5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,4291 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+// We only use this for a debug check in C++11
+#include <random>
+#endif
+
+using namespace llvm;
+using namespace llvm::sroa;
+
+#define DEBUG_TYPE "sroa"
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to enable randomly shuffling the slices to help uncover
+/// instability in their order.
+static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
+ cl::init(false), cl::Hidden);
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
+
+namespace {
+/// \brief A custom IRBuilder inserter which prefixes all names if they are
+/// preserved.
+template <bool preserveNames = true>
+class IRBuilderPrefixedInserter
+ : public IRBuilderDefaultInserter<preserveNames> {
+ std::string Prefix;
+
+public:
+ void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+protected:
+ void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+ BasicBlock::iterator InsertPt) const {
+ IRBuilderDefaultInserter<preserveNames>::InsertHelper(
+ I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
+ }
+};
+
+// Specialization for not preserving the name is trivial.
+template <>
+class IRBuilderPrefixedInserter<false>
+ : public IRBuilderDefaultInserter<false> {
+public:
+ void SetNamePrefix(const Twine &P) {}
+};
+
+/// \brief Provide a typedef for IRBuilder that drops names in release builds.
+#ifndef NDEBUG
+typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
+ IRBuilderTy;
+#else
+typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
+ IRBuilderTy;
+#endif
+}
+
+namespace {
+/// \brief A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
+ /// \brief The beginning offset of the range.
+ uint64_t BeginOffset;
+
+ /// \brief The ending offset, not included in the range.
+ uint64_t EndOffset;
+
+ /// \brief Storage for both the use of this slice and whether it can be
+ /// split.
+ PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+ Slice() : BeginOffset(), EndOffset() {}
+ Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset),
+ UseAndIsSplittable(U, IsSplittable) {}
+
+ uint64_t beginOffset() const { return BeginOffset; }
+ uint64_t endOffset() const { return EndOffset; }
+
+ bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+ void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+ Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+ bool isDead() const { return getUse() == nullptr; }
+ void kill() { UseAndIsSplittable.setPointer(nullptr); }
+
+ /// \brief Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing; within equal start offsets, unsplittable slices come
+ /// before splittable ones, and then end offsets are decreasing. Thus the
+ /// spanning range comes first among like slices with the same start.
+ bool operator<(const Slice &RHS) const {
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
+ return false;
+ }
+
+ /// \brief Support comparison with a single offset to allow binary searches.
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+ uint64_t RHSOffset) {
+ return LHS.beginOffset() < RHSOffset;
+ }
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const Slice &RHS) {
+ return LHSOffset < RHS.beginOffset();
+ }
+
+ bool operator==(const Slice &RHS) const {
+ return isSplittable() == RHS.isSplittable() &&
+ beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
+ }
+ bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
+};
+} // end anonymous namespace
+
+namespace llvm {
+template <typename T> struct isPodLike;
+template <> struct isPodLike<Slice> { static const bool value = true; };
+}
+
+/// \brief Representation of the alloca slices.
+///
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class llvm::sroa::AllocaSlices {
+public:
+ /// \brief Construct the slices of a particular alloca.
+ AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+
+ /// \brief Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the slices are never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// \brief Support for iterating over the slices.
+ /// @{
+ typedef SmallVectorImpl<Slice>::iterator iterator;
+ typedef iterator_range<iterator> range;
+ iterator begin() { return Slices.begin(); }
+ iterator end() { return Slices.end(); }
+
+ typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
+ typedef iterator_range<const_iterator> const_range;
+ const_iterator begin() const { return Slices.begin(); }
+ const_iterator end() const { return Slices.end(); }
+ /// @}
+
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare the iterator and range accessor for walking the
+ // partitions.
+ class partition_iterator;
+ iterator_range<partition_iterator> partitions();
+
+ /// \brief Access the dead users for this alloca.
+ ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
+ /// \brief Access the dead operands referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void dump(const_iterator I) const;
+ void dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class SliceBuilder;
+ friend class AllocaSlices::SliceBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// \brief Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// \brief The instruction responsible for this alloca not having a known set
+ /// of slices.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to form slices of the
+ /// alloca. This will be null if the alloca slices are analyzed successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// \brief The slices of the alloca.
+ ///
+ /// We store a vector of the slices formed by uses of the alloca here. This
+ /// vector is sorted by increasing begin offset, and then the unsplittable
+ /// slices before the splittable ones. See the Slice inner class for more
+ /// details.
+ SmallVector<Slice, 8> Slices;
+
+ /// \brief Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by slice. This is because we expect an
+ /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+ /// all these instructions can simply be removed and replaced with undef as
+ /// they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
+ /// \brief Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+};
+
+/// \brief A partition of the slices.
+///
+/// An ephemeral representation for a range of slices which can be viewed as
+/// a partition of the alloca. This range represents a span of the alloca's
+/// memory which cannot be split, and provides access to all of the slices
+/// overlapping some part of the partition.
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ typedef AllocaSlices::iterator iterator;
+
+ /// \brief The beginning and ending offsets of the alloca for this
+ /// partition.
+ uint64_t BeginOffset, EndOffset;
+
+ /// \brief The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// \brief A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// \brief Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+ /// \brief The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// \brief The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// \brief The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// \brief Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// \brief Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// \brief An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It walks the slices through the non-const AllocaSlices::iterator type; it
+/// is not templated over const and non-const slice iterators.
+class AllocaSlices::partition_iterator
+ : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+ Partition> {
+ friend class AllocaSlices;
+
+ /// \brief Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// \brief We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// \brief We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset;
+
+ /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// \brief Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
+ P.SplitTails.erase(
+ std::remove_if(
+ P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+ P.SplitTails.end());
+ assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If we have split slices and the next slice is after a gap and is
+ // not splittable, immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+ // partitions.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed positions of partitions are marked by the P.SI iterator and
+ // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+};
+
+/// \brief A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+}
+
+static Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1 + CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2))
+ return SI.getOperand(1);
+
+ return nullptr;
+}
+
+/// \brief A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ // If PN merges together the same value, return that value.
+ return PN->hasConstantValue();
+ }
+ return foldSelectInst(cast<SelectInst>(I));
+}
+
+/// \brief Builder for the alloca slices.
+///
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+ friend class PtrUseVisitor<SliceBuilder>;
+ friend class InstVisitor<SliceBuilder>;
+ typedef PtrUseVisitor<SliceBuilder> Base;
+
+ const uint64_t AllocSize;
+ AllocaSlices &AS;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+ SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
+
+ /// \brief Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+ : PtrUseVisitor<SliceBuilder>(DL),
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I).second)
+ AS.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which have a zero size or start either before or
+ // past the end of the allocation.
+ if (Size == 0 || Offset.uge(AllocSize)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ return markAsDead(I);
+ }
+
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ // This may appear superficially to be something we could ignore entirely,
+ // but that is not so! There may be widened loads or PHI-node uses where
+ // some instructions are dead but not others. We can't completely ignore
+ // them, and so have to record at least the information here.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to remain within the " << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+ }
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ return Base::visitBitCastInst(BC);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ if (SROAStrictInbounds && GEPI.isInBounds()) {
+ // FIXME: This is a manually un-factored variant of the basic code inside
+ // of GEPs with checking of the inbounds invariant specified in the
+ // langref in a very strict sense. If we ever want to enable
+ // SROAStrictInbounds, this code should be factored cleanly into
+ // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+ // by writing out the code here where we have the underlying allocation
+ // size readily available.
+ APInt GEPOffset = Offset;
+ const DataLayout &DL = GEPI.getModule()->getDataLayout();
+ for (gep_type_iterator GTI = gep_type_begin(GEPI),
+ GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ break;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ GEPOffset +=
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+ } else {
+ // For array or vector indices, scale the index by the size of the
+ // type.
+ APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+ GEPOffset += Index * APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()));
+ }
+
+ // If this index has computed an intermediate pointer which is not
+ // inbounds, then the result of the GEP is a poison value and we can
+ // delete it and all uses.
+ if (GEPOffset.ugt(AllocSize))
+ return markAsDead(GEPI);
+ }
+ }
+
+ return Base::visitGetElementPtrInst(GEPI);
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+ uint64_t Size, bool IsVolatile) {
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&LI);
+
+ const DataLayout &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return PI.setEscapedAndAborted(&SI);
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
+
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
+ << " which extends past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
+ return markAsDead(SI);
+ }
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && Offset.uge(AllocSize)))
+ // Zero-length or fully out-of-bounds memsets can be ignored entirely.
+ return markAsDead(II);
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
+ (bool)Length);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) { // Handles one side (the current use *U) of a memory transfer intrinsic.
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); // Null when the length operand is not a ConstantInt.
+ if (Length && Length->getValue() == 0)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ // Because we can visit these intrinsics twice, also check to see if the
+ // first time marked this instruction as dead. If so, skip it.
+ if (VisitedDeadInsts.count(&II))
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II); // No slice can be formed without a known offset.
+
+ // This side of the transfer is completely out-of-bounds, and so we can
+ // nuke the entire transfer. However, we also need to nuke the other side
+ // if already added to our partitions.
+ // FIXME: Yet another place we really should bypass this when
+ // instrumenting for ASan.
+ if (Offset.uge(AllocSize)) {
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
+ if (MTPI != MemTransferSliceMap.end())
+ AS.Slices[MTPI->second].kill(); // Kill the slice recorded for the other side.
+ return markAsDead(II);
+ }
+
+ uint64_t RawOffset = Offset.getLimitedValue();
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; // Non-constant length: use everything from RawOffset to the end of the alloca.
+
+ // Check for the special case where the same exact value is used for both
+ // source and dest.
+ if (*U == II.getRawDest() && *U == II.getRawSource()) {
+ // For non-volatile transfers this is a no-op.
+ if (!II.isVolatile())
+ return markAsDead(II);
+
+ return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+ }
+
+ // If we have seen both source and destination for a mem transfer, then
+ // they both point to the same alloca.
+ bool Inserted;
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+ std::tie(MTPI, Inserted) =
+ MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size())); // On first sight, map II to the index the slice inserted below will get.
+ unsigned PrevIdx = MTPI->second;
+ if (!Inserted) {
+ Slice &PrevP = AS.Slices[PrevIdx]; // Slice recorded when the other side was visited.
+
+ // Check if the begin offsets match and this is a non-volatile transfer.
+ // In that case, we can completely elide the transfer.
+ if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+ PrevP.kill();
+ return markAsDead(II);
+ }
+
+ // Otherwise we have an offset transfer within the same alloca. We can't
+ // split those.
+ PrevP.makeUnsplittable();
+ }
+
+ // Insert the use now that we've fixed up the splittable nature.
+ insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); // Splittable only for the first-seen side with a constant length.
+
+ // Check that we ended up with a valid index in the map.
+ assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
+ "Map index doesn't point back to a slice with this user.");
+ }
+
+ // Only lifetime start/end markers get a slice here; every other intrinsic
+ // is forwarded to the base visitor.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+   if (!IsOffsetKnown) {
+     PI.setAborted(&II);
+     return;
+   }
+
+   switch (II.getIntrinsicID()) {
+   case Intrinsic::lifetime_start:
+   case Intrinsic::lifetime_end: {
+     // Operand 0 carries the marker's length; clamp it to what remains of
+     // the alloca past the current offset.
+     ConstantInt *MarkerLen = cast<ConstantInt>(II.getArgOperand(0));
+     uint64_t ClampedSize = std::min(AllocSize - Offset.getLimitedValue(),
+                                     MarkerLen->getLimitedValue());
+     insertUse(II, Offset, ClampedSize, true);
+     return;
+   }
+   default:
+     Base::visitIntrinsicInst(II);
+     return;
+   }
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { // Returns the first transitive user that blocks slicing through Root, or nullptr; Size receives the largest load/store size found.
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for slicing purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; // Worklist of (used value, user) pairs.
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ const DataLayout &DL = Root->getModule()->getDataLayout();
+ // If there are no loads or stores, the access is dead. We mark that as
+ // a size zero access.
+ Size = 0;
+ do {
+ Instruction *I, *UsedI;
+ std::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI; // The tracked value is operand 0 of the store, i.e. it is itself being stored.
+ Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP; // Only all-zero-index GEPs are looked through.
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I)) {
+ return I; // Any other kind of user is reported as unsafe.
+ }
+
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(std::make_pair(I, cast<Instruction>(U))); // Enqueue each user only once.
+ } while (!Uses.empty());
+
+ return nullptr;
+ }
+
+ void visitPHINodeOrSelectInst(Instruction &I) { // Shared handling for a PHI or select reached through the current use U.
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ if (I.use_empty())
+ return markAsDead(I);
+
+ // TODO: We could use SimplifyInstruction here to fold PHINodes and
+ // SelectInsts. However, doing so requires to change the current
+ // dead-operand-tracking mechanism. For instance, suppose neither loading
+ // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+ // trap either. However, if we simply replace %U with undef using the
+ // current dead-operand-tracking mechanism, "load (select undef, undef,
+ // %other)" may trap because the select may return the first operand
+ // "undef".
+ if (Value *Result = foldPHINodeOrSelectInst(I)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the PHI/select as if we had RAUW'ed it.
+ enqueueUsers(I);
+ else
+ // Otherwise the operand to the PHI/select is dead, and we can replace
+ // it with undef.
+ AS.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&I);
+
+ // See if we already have computed info on this node.
+ uint64_t &Size = PHIOrSelectSizes[&I]; // Reference into the cache, so the scan below writes its result straight into it.
+ if (!Size) { // Zero: not yet computed, or previously computed as a size-zero access.
+ // This is a new PHI/Select, check for an unsafe use of it.
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
+ return PI.setAborted(UnsafeI);
+ }
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ // FIXME: This should instead be escaped in the event we're instrumenting
+ // for address sanitization.
+ if (Offset.uge(AllocSize)) {
+ AS.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(I, Offset, Size);
+ }
+
+ /// PHI nodes are handled by the shared PHI/select visitor.
+ void visitPHINode(PHINode &PN) {
+   visitPHINodeOrSelectInst(PN);
+ }
+
+ /// Selects are handled by the shared PHI/select visitor.
+ void visitSelectInst(SelectInst &SI) {
+   visitPHINodeOrSelectInst(SI);
+ }
+
+ /// \brief Fallback visitor: an instruction reaching this point is recorded
+ /// as the aborting instruction, which disables SROA for the alloca.
+ void visitInstruction(Instruction &I) {
+   PI.setAborted(&I);
+ }
+};
+
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+ :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ AI(AI),
+#endif
+ PointerEscapingInstr(nullptr) {
+ SliceBuilder PB(DL, AI, *this); // The builder is handed *this so it can record slices here.
+ SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); // Walk the uses of the alloca pointer.
+ if (PtrI.isEscaped() || PtrI.isAborted()) {
+ // FIXME: We should sink the escape vs. abort info into the caller nicely,
+ // possibly by just storing the PtrInfo in the AllocaSlices.
+ PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+ : PtrI.getAbortingInst();
+ assert(PointerEscapingInstr && "Did not track a bad instruction");
+ return; // Slices are left unfiltered and unsorted in this case.
+ }
+
+ Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
+ [](const Slice &S) {
+ return S.isDead();
+ }),
+ Slices.end()); // Erase-remove: drop every slice that reports isDead().
+
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+ if (SROARandomShuffleSlices) { // Debug-build option: shuffle before sorting, seeded from the current time.
+ std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
+ std::shuffle(Slices.begin(), Slices.end(), MT);
+ }
+#endif
+
+ // Sort the uses. This arranges for the offsets to be in ascending order,
+ // and the sizes to be in descending order.
+ std::sort(Slices.begin(), Slices.end());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Print one slice: its header, a newline, then the instruction using it.
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+                         StringRef Indent) const {
+  // Header first ("[begin,end) slice #N ...").
+  printSlice(OS, I, Indent);
+  OS << "\n";
+
+  // Then the user of the slice.
+  printUse(OS, I, Indent);
+}
+
+/// Print the half-open byte range of a slice, its index within this
+/// container, and a marker when it is splittable.
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+                              StringRef Indent) const {
+  const char *SplittableTag = I->isSplittable() ? " (splittable)" : "";
+
+  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")";
+  OS << " slice #" << (I - begin());
+  OS << SplittableTag;
+}
+
+/// Print the instruction that uses the given slice.
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+                            StringRef Indent) const {
+  Use *SliceUse = I->getUse();
+  OS << Indent << " used by: " << *SliceUse->getUser() << "\n";
+}
+
+/// Print every slice of the alloca, or, when a pointer-escaping instruction
+/// was recorded, explain why the alloca could not be analyzed.
+void AllocaSlices::print(raw_ostream &OS) const {
+  if (!PointerEscapingInstr) {
+    OS << "Slices of alloca: " << AI << "\n";
+    const_iterator It = begin();
+    const_iterator End = end();
+    while (It != End) {
+      print(OS, It);
+      ++It;
+    }
+    return;
+  }
+
+  OS << "Can't analyze slices for alloca: " << AI << "\n"
+     << " A pointer to this alloca escaped by:\n"
+     << " " << *PointerEscapingInstr << "\n";
+}
+
+/// Print a single slice to the debug stream.
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+  raw_ostream &DebugOS = dbgs();
+  print(DebugOS, I);
+}
+/// Print all slices to the debug stream.
+LLVM_DUMP_METHOD void AllocaSlices::dump() const {
+  raw_ostream &DebugOS = dbgs();
+  print(DebugOS);
+}
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices. Returns the one load/store type shared by all considered uses if there is one; otherwise the widest qualifying integer type seen, which may be null.
+static Type *findCommonType(AllocaSlices::const_iterator B,
+ AllocaSlices::const_iterator E,
+ uint64_t EndOffset) {
+ Type *Ty = nullptr;
+ bool TyIsCommon = true;
+ IntegerType *ITy = nullptr;
+
+ // Note that we need to look at *every* alloca slice's Use to ensure we
+ // always get consistent results regardless of the order of slices.
+ for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+ Use *U = I->getUse();
+ if (isa<IntrinsicInst>(*U->getUser()))
+ continue; // Intrinsic users are ignored.
+ if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+ continue; // Only slices spanning exactly [B->beginOffset(), EndOffset) are considered.
+
+ Type *UserTy = nullptr; // Stays null for users that are neither loads nor stores.
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ UserTy = SI->getValueOperand()->getType();
+ }
+
+ if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
+ // If the type is larger than the partition, skip it. We only encounter
+ // this for split integer operations where we want to use the type of the
+ // entity causing the split. Also skip if the type is not a byte width
+ // multiple.
+ if (UserITy->getBitWidth() % 8 != 0 ||
+ UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+ continue;
+
+ // Track the largest bitwidth integer type used in this way in case there
+ // is no common type.
+ if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+ ITy = UserITy;
+ }
+
+ // To avoid depending on the order of slices, Ty and TyIsCommon must not
+ // depend on types skipped above.
+ if (!UserTy || (Ty && Ty != UserTy))
+ TyIsCommon = false; // Give up on anything but an iN type.
+ else
+ Ty = UserTy;
+ }
+
+ return TyIsCommon ? Ty : ITy;
+}
+
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a PHI if its only users are simple loads in the PHI's
+/// own block and if each incoming pointer can be loaded in its predecessor.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN) {
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ unsigned MaxAlign = 0; // Largest alignment among the loads of the PHI.
+ bool HaveLoad = false;
+ for (User *U : PN.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB)
+ return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ MaxAlign = std::max(MaxAlign, LI->getAlignment());
+ HaveLoad = true;
+ }
+
+ if (!HaveLoad)
+ return false; // A PHI with no users at all is not a speculation candidate.
+
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
+ // We can only transform this if it is safe to push the loads into the
+ // predecessor blocks. The only thing to watch out for is that we can't put
+ // a possibly trapping load in the predecessor if it is a critical edge.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (isDereferenceablePointer(InVal, DL) ||
+ isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
+static void speculatePHINodeLoads(PHINode &PN) { // Replaces every load of the pointer PHI with a new value PHI fed by one load per predecessor, then erases PN.
+ DEBUG(dbgs() << " original: " << PN << "\n");
+
+ Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+ IRBuilderTy PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
+
+ // Get the AA tags and alignment to use from one of the loads. Which load
+ // is picked does not matter, even if the loads differ in these.
+ LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
+
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
+ unsigned Align = SomeLoad->getAlignment();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ while (!PN.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(PN.user_back()); // cast<> asserts that every user is a load.
+ LI->replaceAllUsesWith(NewPN);
+ LI->eraseFromParent();
+ }
+
+ // Inject loads into all of the pred blocks.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ TerminatorInst *TI = Pred->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+ IRBuilderTy PredBuilder(TI); // The new load is inserted before the predecessor's terminator.
+
+ LoadInst *Load = PredBuilder.CreateLoad(
+ InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+ ++NumLoadsSpeculated;
+ Load->setAlignment(Align);
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ NewPN->addIncoming(Load, Pred);
+ }
+
+ DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ PN.eraseFromParent();
+}
+
+/// A select between two pointers whose result is only ever loaded can be
+/// turned into a select between two loaded values, which lets the load of
+/// the alloca side be promoted.
+/// From this:
+/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// %V2 = load i32* %Other
+/// %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// This returns true when every user of the select is a simple load and
+/// both select operands can be loaded unconditionally at each such load.
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
+  Value *TrueOp = SI.getTrueValue();
+  Value *FalseOp = SI.getFalseValue();
+  const DataLayout &DL = SI.getModule()->getDataLayout();
+  const bool TrueAlwaysDeref = isDereferenceablePointer(TrueOp, DL);
+  const bool FalseAlwaysDeref = isDereferenceablePointer(FalseOp, DL);
+
+  for (User *SelUser : SI.users()) {
+    LoadInst *Load = dyn_cast<LoadInst>(SelUser);
+    if (!(Load && Load->isSimple()))
+      return false;
+
+    // Each operand must be dereferenceable, either absolutely (e.g. an
+    // alloca) or at this particular load because other accesses to it are
+    // visible.
+    const bool TrueOk =
+        TrueAlwaysDeref ||
+        isSafeToLoadUnconditionally(TrueOp, Load, Load->getAlignment());
+    if (!TrueOk)
+      return false;
+
+    const bool FalseOk =
+        FalseAlwaysDeref ||
+        isSafeToLoadUnconditionally(FalseOp, Load, Load->getAlignment());
+    if (!FalseOk)
+      return false;
+  }
+
+  return true;
+}
+
+static void speculateSelectInstLoads(SelectInst &SI) { // Rewrites each load of the pointer select into a select of two loads, then erases SI.
+ DEBUG(dbgs() << " original: " << SI << "\n");
+
+ IRBuilderTy IRB(&SI);
+ Value *TV = SI.getTrueValue();
+ Value *FV = SI.getFalseValue();
+ // Replace the loads of the select with a select of two loads.
+ while (!SI.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(SI.user_back()); // cast<> asserts that every user is a load.
+ assert(LI->isSimple() && "We only speculate simple loads");
+
+ IRB.SetInsertPoint(LI); // New instructions go immediately before the load being replaced.
+ LoadInst *TL =
+ IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
+ LoadInst *FL =
+ IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
+ NumLoadsSpeculated += 2;
+
+ // Transfer alignment and AA info if present.
+ TL->setAlignment(LI->getAlignment());
+ FL->setAlignment(LI->getAlignment());
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TL->setAAMetadata(Tags);
+ FL->setAAMetadata(Tags);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+ LI->getName() + ".sroa.speculated");
+
+ DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ LI->eraseFromParent();
+ }
+ SI.eraseFromParent();
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// Returns \p BasePtr unchanged when \p Indices is empty or holds a single
+/// zero constant; otherwise an inbounds GEP is created through the
+/// IRBuilder and named with \p NamePrefix followed by "sroa_idx".
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+                       SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
+  const bool NoIndices = Indices.empty();
+
+  // A lone zero index addresses the base itself, so no GEP is needed. The
+  // cast is only evaluated when there is exactly one index.
+  const bool LoneZeroIndex =
+      !NoIndices && Indices.size() == 1 &&
+      cast<ConstantInt>(Indices.back())->isZero();
+
+  if (NoIndices || LoneZeroIndex)
+    return BasePtr;
+
+  return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
+                               NamePrefix + "sroa_idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+ // Pointer size to use for the indices.
+ unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0; // Number of zero indices appended by the loop below.
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+
+ if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+ ElementTy = ArrayTy->getElementType();
+ Indices.push_back(IRB.getIntN(PtrSize, 0)); // Array layers use a pointer-width zero index.
+ } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+ ElementTy = VectorTy->getElementType();
+ Indices.push_back(IRB.getInt32(0));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ if (STy->element_begin() == STy->element_end())
+ break; // Nothing left to descend into.
+ ElementTy = *STy->element_begin(); // Descend into the first struct element.
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end()); // TargetTy was not reached: remove the indices appended above.
+
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP. Returns null when no natural GEP can be formed for the remaining Offset within Ty.
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return nullptr;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
+ if (ElementSizeInBits % 8 != 0) {
+ // GEPs over non-multiple of 8 size vector elements are invalid.
+ return nullptr;
+ }
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(VecTy->getNumElements())) // NOTE(review): ugt lets an index equal to the element count through -- confirm that is intended.
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize; // Offset is now the remainder within the selected element.
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, NamePrefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements())) // NOTE(review): same bound as the vector case above -- confirm.
+ return nullptr;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return nullptr;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); // Make Offset relative to the start of the selected field.
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
+ return nullptr; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ Twine NamePrefix) {
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+ // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+ // an i8. NOTE(review): the condition below bails out when the pointer is i8* AND TargetTy is i8, which reads as the opposite of this sentence -- confirm which is intended.
+ if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
+ return nullptr;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return nullptr; // We can't GEP through an unsized element.
+ APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ if (ElementSize == 0)
+ return nullptr; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.sdiv(ElementSize); // Leading index: whole pointee-sized elements covered by Offset (signed division).
+
+ Offset -= NumSkippedElements * ElementSize; // Remainder to be resolved inside the pointee type.
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+///
+/// \param IRB Builder used for any GEP or bitcast that has to be created.
+/// \param Ptr Pointer to adjust.
+/// \param Offset Byte offset to apply to \p Ptr.
+/// \param PointerTy Pointer type the result must have.
+/// \param NamePrefix Prefix for the names of created instructions.
+/// \returns A value of type \p PointerTy.
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+                             APInt Offset, Type *PointerTy, Twine NamePrefix) {
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(Ptr);
+  SmallVector<Value *, 4> Indices;
+
+  // We may end up computing an offset pointer that has the wrong type. If we
+  // never are able to compute one directly that has the correct type, we'll
+  // fall back to it, so keep it and the base it was computed from around here.
+  Value *OffsetPtr = nullptr;
+  Value *OffsetBasePtr = nullptr;
+
+  // Remember any i8 pointer we come across to re-use if we need to do a raw
+  // byte offset.
+  Value *Int8Ptr = nullptr;
+  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+  Type *TargetTy = PointerTy->getPointerElementType();
+
+  // The raw-byte fallback below works on exactly this pointer type, so this
+  // is also the type an existing pointer must have to be reusable for it.
+  Type *RawBytePtrTy = IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace());
+
+  do {
+    // First fold any existing GEPs into the offset.
+    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+      APInt GEPOffset(Offset.getBitWidth(), 0);
+      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+        break;
+      Offset += GEPOffset;
+      Ptr = GEP->getPointerOperand();
+      if (!Visited.insert(Ptr).second)
+        break;
+    }
+
+    // See if we can perform a natural GEP here.
+    Indices.clear();
+    if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
+                                           Indices, NamePrefix)) {
+      // If we have a new natural pointer at the offset, clear out any old
+      // offset pointer we computed. Unless it is the base pointer or
+      // a non-instruction, we built a GEP we don't need. Zap it.
+      if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+        if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+          assert(I->use_empty() && "Built a GEP with uses some how!");
+          I->eraseFromParent();
+        }
+      OffsetPtr = P;
+      OffsetBasePtr = Ptr;
+      // If we also found a pointer of the right type, we're done.
+      if (P->getType() == PointerTy)
+        return P;
+    }
+
+    // Stash this pointer if we've found an i8*. The test has to look at the
+    // pointer type itself: Ptr's own type is a pointer, so asking whether it
+    // is an 8-bit integer can never succeed and the stash would stay unused.
+    if (Ptr->getType() == RawBytePtrTy) {
+      Int8Ptr = Ptr;
+      Int8PtrOffset = Offset;
+    }
+
+    // Peel off a layer of the pointer and update the offset appropriately.
+    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+      Ptr = cast<Operator>(Ptr)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+      if (GA->mayBeOverridden())
+        break;
+      Ptr = GA->getAliasee();
+    } else {
+      break;
+    }
+    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(Ptr).second);
+
+  if (!OffsetPtr) {
+    // No natural GEP was found anywhere on the walk: fall back to a raw byte
+    // offset from an i8*, creating the cast only if none was stashed.
+    if (!Int8Ptr) {
+      Int8Ptr = IRB.CreateBitCast(Ptr, RawBytePtrTy,
+                                  NamePrefix + "sroa_raw_cast");
+      Int8PtrOffset = Offset;
+    }
+
+    OffsetPtr = Int8PtrOffset == 0
+                    ? Int8Ptr
+                    : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+                                            IRB.getInt(Int8PtrOffset),
+                                            NamePrefix + "sroa_raw_idx");
+  }
+  Ptr = OffsetPtr;
+
+  // On the off chance we were targeting i8*, guard the bitcast here.
+  if (Ptr->getType() != PointerTy)
+    Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
+
+  return Ptr;
+}
+
+/// \brief Compute the alignment of a load or store once it is moved to
+/// \p Offset.
+///
+/// An explicit alignment on the instruction is used when non-zero; otherwise
+/// the ABI alignment of the accessed type is used. The result is
+/// MinAlign of that alignment and \p Offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+                                     const DataLayout &DL) {
+  LoadInst *Load = dyn_cast<LoadInst>(I);
+  StoreInst *Store = Load ? nullptr : dyn_cast<StoreInst>(I);
+  if (!Load && !Store)
+    llvm_unreachable("Only loads and stores are allowed!");
+
+  unsigned BaseAlign = Load ? Load->getAlignment() : Store->getAlignment();
+  Type *AccessTy =
+      Load ? Load->getType() : Store->getValueOperand()->getType();
+
+  // Zero means no explicit alignment was given on the instruction.
+  if (BaseAlign == 0)
+    BaseAlign = DL.getABITypeAlignment(AccessTy);
+
+  return MinAlign(BaseAlign, Offset);
+}
+
+/// \brief Test whether a value of type \p OldTy can be converted to
+/// \p NewTy.
+///
+/// Use this predicate to guard calls to convertValue. Identical types always
+/// convert. Two distinct integer types never do. Otherwise both types must
+/// be single-value types with the same bit size, and when a pointer is
+/// involved the other side must be a pointer or an integer (scalar types
+/// are compared for vectors).
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+  if (OldTy == NewTy)
+    return true;
+
+  // For integer types, we can't handle any bit-width differences. This would
+  // break both vector conversions with extension and introduce endianness
+  // issues when in conjunction with loads and stores.
+  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+    assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+               cast<IntegerType>(NewTy)->getBitWidth() &&
+           "We can't have the same bitwidth for different int types");
+    return false;
+  }
+
+  const bool SameBitSize =
+      DL.getTypeSizeInBits(NewTy) == DL.getTypeSizeInBits(OldTy);
+  if (!SameBitSize)
+    return false;
+  if (!(NewTy->isSingleValueType() && OldTy->isSingleValueType()))
+    return false;
+
+  // Pointers convert to and from integers, and the same holds for vectors
+  // of them, so only the scalar types matter from here on.
+  Type *OldScalarTy = OldTy->getScalarType();
+  Type *NewScalarTy = NewTy->getScalarType();
+  const bool OldIsPtr = OldScalarTy->isPointerTy();
+  const bool NewIsPtr = NewScalarTy->isPointerTy();
+
+  if (!OldIsPtr && !NewIsPtr)
+    return true;
+  if (OldIsPtr && NewIsPtr)
+    return true;
+
+  // Exactly one side is a pointer: the conversion needs an integer type on
+  // one of the two sides.
+  return NewScalarTy->isIntegerTy() || OldScalarTy->isIntegerTy();
+}
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// Picks between bitcast, inttoptr and ptrtoint depending on the scalar
+/// types involved. Use the \c canConvertValue predicate to test two types
+/// for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+                           Type *NewTy) {
+  Type *OldTy = V->getType();
+  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+  if (OldTy == NewTy)
+    return V;
+
+  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+         "Integer types must be the exact same to convert.");
+
+  Type *OldScalarTy = OldTy->getScalarType();
+  Type *NewScalarTy = NewTy->getScalarType();
+
+  // Integer -> pointer needs inttoptr. When exactly one side is a vector an
+  // additional bitcast to the pointer-sized integer type comes first, e.g.
+  //   <2 x i32> to i8*   --> <2 x i32> to i64 to i8*
+  //   i128 to <2 x i8*>  --> i128 to <2 x i64> to <2 x i8*>
+  if (OldScalarTy->isIntegerTy() && NewScalarTy->isPointerTy()) {
+    if (OldTy->isVectorTy() != NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+    return IRB.CreateIntToPtr(V, NewTy);
+  }
+
+  // Pointer -> integer needs ptrtoint. When exactly one side is a vector the
+  // pointer-sized integer result is bitcast afterwards, e.g.
+  //   <2 x i8*> to i128  --> <2 x i8*> to <2 x i64> to i128
+  //   i8* to <2 x i32>   --> i8* to i64 to <2 x i32>
+  if (OldScalarTy->isPointerTy() && NewScalarTy->isIntegerTy()) {
+    if (OldTy->isVectorTy() != NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+    return IRB.CreatePtrToInt(V, NewTy);
+  }
+
+  return IRB.CreateBitCast(V, NewTy);
+}
+
+/// \brief Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partition which is slated
+/// for a single slice. The slice must start and end on element boundaries of Ty (after clamping to the partition) and its user must be one of the accepted kinds checked below.
+static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
+ VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
+ // First validate the slice offsets.
+ uint64_t BeginOffset =
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); // Slice start clamped to the partition, relative to the partition start.
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= Ty->getNumElements())
+ return false; // Start is not on an element boundary, or lies past the last element.
+ uint64_t EndOffset =
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
+ return false;
+
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ uint64_t NumElements = EndIndex - BeginIndex;
+ Type *SliceTy = (NumElements == 1)
+ ? Ty->getElementType()
+ : VectorType::get(Ty->getElementType(), NumElements); // A single element is treated as a scalar, several as a sub-vector.
+
+ Type *SplitIntTy =
+ Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); // Integer type as wide as the covered elements, in bits.
+
+ Use *U = S.getUse();
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false; // Among other intrinsics only lifetime markers are accepted.
+ } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ Type *LTy = LI->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { // The load extends beyond the partition.
+ assert(LTy->isIntegerTy());
+ LTy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, SliceTy, LTy))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ Type *STy = SI->getValueOperand()->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { // The store extends beyond the partition.
+ assert(STy->isIntegerTy());
+ STy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, STy, SliceTy))
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+///
+/// \returns the first candidate vector type for which every slice and split
+/// slice tail of \p P is viable, or null if there is none.
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+  // Collect the candidate types for vector-based promotion. Also track whether
+  // we have different element types.
+  SmallVector<VectorType *, 4> CandidateTys;
+  Type *CommonEltTy = nullptr;
+  bool HaveCommonEltTy = true;
+  auto CheckCandidateType = [&](Type *Ty) {
+    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+      CandidateTys.push_back(VTy);
+      if (!CommonEltTy)
+        CommonEltTy = VTy->getElementType();
+      else if (CommonEltTy != VTy->getElementType())
+        HaveCommonEltTy = false;
+    }
+  };
+  // Consider any loads or stores that are the exact size of the slice.
+  for (const Slice &S : P)
+    if (S.beginOffset() == P.beginOffset() &&
+        S.endOffset() == P.endOffset()) {
+      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+        CheckCandidateType(LI->getType());
+      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+        CheckCandidateType(SI->getValueOperand()->getType());
+    }
+
+  // If we didn't find a vector type, nothing to do here.
+  if (CandidateTys.empty())
+    return nullptr;
+
+  // Remove non-integer vector types if we had multiple common element types.
+  // FIXME: It'd be nice to replace them with integer vector types, but we can't
+  // do that until all the backends are known to produce good code for all
+  // integer vector types.
+  if (!HaveCommonEltTy) {
+    CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
+                                      [](VectorType *VTy) {
+                         return !VTy->getElementType()->isIntegerTy();
+                       }),
+                       CandidateTys.end());
+
+    // If there were no integer vector types, give up.
+    if (CandidateTys.empty())
+      return nullptr;
+
+    // Rank the remaining candidate vector types. This is easy because we know
+    // they're all integer vectors. We sort by ascending number of elements.
+    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+      assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+             "Cannot have vector types of different sizes!");
+      assert(RHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      assert(LHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      return RHSTy->getNumElements() < LHSTy->getNumElements();
+    };
+    // std::unique needs an *equivalence* predicate, not the ordering used for
+    // sorting. Passing the strict less-than comparator would keep duplicates
+    // of the smallest type and drop every distinct larger candidate, so use a
+    // matching equality on the same key instead.
+    auto HaveSameRank = [](VectorType *RHSTy, VectorType *LHSTy) {
+      return RHSTy->getNumElements() == LHSTy->getNumElements();
+    };
+    std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+    CandidateTys.erase(
+        std::unique(CandidateTys.begin(), CandidateTys.end(), HaveSameRank),
+        CandidateTys.end());
+  } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+    for (VectorType *VTy : CandidateTys) {
+      assert(VTy->getElementType() == CommonEltTy &&
+             "Unaccounted for element type!");
+      assert(VTy == CandidateTys[0] &&
+             "Different vector types with the same element type!");
+    }
+#endif
+    CandidateTys.resize(1);
+  }
+
+  // Try each vector type, and return the one which works.
+  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+    uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+
+    // While the definition of LLVM vectors is bitpacked, we don't support sizes
+    // that aren't byte sized.
+    if (ElementSize % 8)
+      return false;
+    assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+           "vector size not a multiple of element size?");
+    // From here on the element size is expressed in bytes.
+    ElementSize /= 8;
+
+    for (const Slice &S : P)
+      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+        return false;
+
+    for (const Slice *S : P.splitSliceTails())
+      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+        return false;
+
+    return true;
+  };
+  for (VectorType *VTy : CandidateTys)
+    if (CheckVectorTypeForPromotion(VTy))
+      return VTy;
+
+  return nullptr;
+}
+
+/// \brief Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
+///
+/// \param AllocBeginOffset Offset at which the (new) alloca begins; slice
+/// offsets are made relative to it.
+/// \param AllocaTy Type of the alloca being considered.
+/// \param WholeAllocaOp Set to true (never cleared) when a non-vector load or
+/// store covering the whole alloca is seen.
+/// \returns false if this slice prevents integer widening.
+static bool isIntegerWideningViableForSlice(const Slice &S,
+                                            uint64_t AllocBeginOffset,
+                                            Type *AllocaTy,
+                                            const DataLayout &DL,
+                                            bool &WholeAllocaOp) {
+  uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
+  // Note: for a split slice tail that begins before the alloca, RelBegin wraps
+  // around; loads and stores in that situation are rejected explicitly below.
+  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
+
+  // We can't reasonably handle cases where the load or store extends past
+  // the end of the alloca's type and into its padding.
+  if (RelEnd > Size)
+    return false;
+
+  Use *U = S.getUse();
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+    if (LI->isVolatile())
+      return false;
+    // We can't handle loads that extend past the allocated memory.
+    if (DL.getTypeStoreSize(LI->getType()) > Size)
+      return false;
+    // The rewriter's integer load path does not support widening a split
+    // slice tail, i.e. a load that starts before this alloca.
+    if (S.beginOffset() < AllocBeginOffset)
+      return false;
+    // Note that we don't count vector loads or stores as whole-alloca
+    // operations which enable integer widening because we would prefer to use
+    // vector widening instead.
+    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
+      WholeAllocaOp = true;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+      // Integer types with bit-padding in their store size are not handled.
+      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+        return false;
+    } else if (RelBegin != 0 || RelEnd != Size ||
+               !canConvertValue(DL, AllocaTy, LI->getType())) {
+      // Non-integer loads need to be convertible from the alloca type so that
+      // they are promotable.
+      return false;
+    }
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+    Type *ValueTy = SI->getValueOperand()->getType();
+    if (SI->isVolatile())
+      return false;
+    // We can't handle stores that extend past the allocated memory.
+    if (DL.getTypeStoreSize(ValueTy) > Size)
+      return false;
+    // The rewriter's integer store path requires the slice to begin inside
+    // the alloca (it asserts BeginOffset >= NewAllocaBeginOffset), so a split
+    // slice tail that starts earlier cannot be widened.
+    if (S.beginOffset() < AllocBeginOffset)
+      return false;
+    // Note that we don't count vector loads or stores as whole-alloca
+    // operations which enable integer widening because we would prefer to use
+    // vector widening instead.
+    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
+      WholeAllocaOp = true;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+      // Integer types with bit-padding in their store size are not handled.
+      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+        return false;
+    } else if (RelBegin != 0 || RelEnd != Size ||
+               !canConvertValue(DL, ValueTy, AllocaTy)) {
+      // Non-integer stores need to be convertible to the alloca type so that
+      // they are promotable.
+      return false;
+    }
+  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+    // Only non-volatile, constant-length memory intrinsics are acceptable.
+    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+      return false;
+    if (!S.isSplittable())
+      return false; // Skip any unsplittable intrinsics.
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+    // Lifetime markers are the only other intrinsics tolerated.
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Decide whether a partition's integer accesses can be widened.
+///
+/// Quick pre-check: returns true only when the loads and stores of partition
+/// \p P could be rewritten as operations on a single integer as wide as
+/// \p AllocaTy, and at least one whole-alloca operation was found (or the
+/// partition has no slices of its own and the width is a legal integer).
+static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
+                                    const DataLayout &DL) {
+  const uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
+
+  // Don't create integer types larger than the maximum bitwidth.
+  if (SizeInBits > IntegerType::MAX_INT_BITS)
+    return false;
+
+  // Don't try to handle allocas with bit-padding.
+  if (DL.getTypeStoreSizeInBits(AllocaTy) != SizeInBits)
+    return false;
+
+  // The alloca type must round-trip through an integer of the same width. We
+  // only verify convertibility here; the alloca itself is not forced to have
+  // integer type.
+  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+  if (!canConvertValue(DL, AllocaTy, IntTy))
+    return false;
+  if (!canConvertValue(DL, IntTy, AllocaTy))
+    return false;
+
+  // While examining uses, we ensure that the alloca has a covering load or
+  // store. We don't want to widen the integer operations only to fail to
+  // promote due to some other unsplittable entry (which we may make splittable
+  // later). However, if there are only splittable uses, go ahead and assume
+  // that we cover the alloca.
+  // FIXME: We shouldn't consider split slices that happen to start in the
+  // partition here...
+  bool WholeAllocaOp = P.begin() == P.end() && DL.isLegalInteger(SizeInBits);
+
+  auto SliceIsViable = [&](const Slice &S) {
+    return isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+                                           WholeAllocaOp);
+  };
+
+  for (const Slice &S : P)
+    if (!SliceIsViable(S))
+      return false;
+
+  for (const Slice *Tail : P.splitSliceTails())
+    if (!SliceIsViable(*Tail))
+      return false;
+
+  return WholeAllocaOp;
+}
+
+/// Extract an integer of type \p Ty located \p Offset bytes into the wider
+/// integer value \p V, emitting a logical shift right and/or a truncate as
+/// needed. The shift amount accounts for the target's endianness.
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+                             IntegerType *Ty, uint64_t Offset,
+                             const Twine &Name) {
+  DEBUG(dbgs() << " start: " << *V << "\n");
+  IntegerType *IntTy = cast<IntegerType>(V->getType());
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element extends past full value");
+
+  // On big-endian targets the byte offset counts from the most significant
+  // end, so measure the distance from the other side instead.
+  const uint64_t ByteShift =
+      DL.isBigEndian()
+          ? DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset
+          : Offset;
+  const uint64_t ShAmt = 8 * ByteShift;
+  if (ShAmt != 0) {
+    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << " shifted: " << *V << "\n");
+  }
+
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot extract to a larger integer!");
+  if (Ty == IntTy)
+    return V;
+
+  V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+  DEBUG(dbgs() << " trunced: " << *V << "\n");
+  return V;
+}
+
+/// Insert the integer \p V into the wider integer \p Old at byte \p Offset.
+///
+/// \p V is zero-extended to the type of \p Old and shifted into position
+/// (endianness-aware); the corresponding bits of \p Old are masked out and the
+/// two are or'ed together. When \p V already has the full width and needs no
+/// shift, it is returned as-is and \p Old is not used.
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+                            Value *V, uint64_t Offset, const Twine &Name) {
+  IntegerType *IntTy = cast<IntegerType>(Old->getType());
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot insert a larger integer!");
+  DEBUG(dbgs() << " start: " << *V << "\n");
+  if (Ty != IntTy) {
+    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+    DEBUG(dbgs() << " extended: " << *V << "\n");
+  }
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element store outside of alloca store");
+
+  // On big-endian targets the byte offset counts from the most significant
+  // end, so measure the distance from the other side instead.
+  const uint64_t ByteShift =
+      DL.isBigEndian()
+          ? DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset
+          : Offset;
+  const uint64_t ShAmt = 8 * ByteShift;
+  if (ShAmt != 0) {
+    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << " shifted: " << *V << "\n");
+  }
+
+  // A full-width, unshifted value replaces the old one outright.
+  if (ShAmt == 0 && Ty->getBitWidth() >= IntTy->getBitWidth())
+    return V;
+
+  // Clear the destination bits in the old value, then merge in the new ones.
+  APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+  Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+  DEBUG(dbgs() << " masked: " << *Old << "\n");
+  V = IRB.CreateOr(Old, V, Name + ".insert");
+  DEBUG(dbgs() << " inserted: " << *V << "\n");
+  return V;
+}
+
+/// Extract elements [\p BeginIndex, \p EndIndex) from the vector value \p V.
+///
+/// Returns \p V itself when the range covers every element, a scalar (via
+/// extractelement) for a single element, and otherwise a narrower vector built
+/// with a shufflevector.
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+                            unsigned EndIndex, const Twine &Name) {
+  VectorType *VecTy = cast<VectorType>(V->getType());
+  const unsigned NumElements = EndIndex - BeginIndex;
+  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+  // Whole vector requested: nothing to do.
+  if (VecTy->getNumElements() == NumElements)
+    return V;
+
+  if (NumElements == 1) {
+    Value *Elt = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+                                          Name + ".extract");
+    DEBUG(dbgs() << " extract: " << *Elt << "\n");
+    return Elt;
+  }
+
+  // Select the consecutive lanes BeginIndex .. EndIndex-1.
+  SmallVector<Constant *, 8> Lanes;
+  Lanes.reserve(NumElements);
+  for (unsigned Lane = BeginIndex; Lane != EndIndex; ++Lane)
+    Lanes.push_back(IRB.getInt32(Lane));
+
+  Value *Sub = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                       ConstantVector::get(Lanes),
+                                       Name + ".extract");
+  DEBUG(dbgs() << " shuffle: " << *Sub << "\n");
+  return Sub;
+}
+
+/// Insert \p V into the vector value \p Old starting at element \p BeginIndex.
+///
+/// A scalar \p V is placed with insertelement. A vector \p V with as many
+/// elements as \p Old simply replaces it. A shorter vector is first widened
+/// with a shufflevector (undef in the untouched lanes) and then blended with
+/// \p Old through a constant-mask select.
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+                           unsigned BeginIndex, const Twine &Name) {
+  VectorType *VecTy = cast<VectorType>(Old->getType());
+  assert(VecTy && "Can only insert a vector into a vector");
+
+  VectorType *Ty = dyn_cast<VectorType>(V->getType());
+  if (!Ty) {
+    // Single element to insert.
+    Value *Result = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+                                            Name + ".insert");
+    DEBUG(dbgs() << " insert: " << *Result << "\n");
+    return Result;
+  }
+
+  assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+         "Too many elements!");
+  if (Ty->getNumElements() == VecTy->getNumElements()) {
+    assert(V->getType() == VecTy && "Vector type mismatch");
+    return V;
+  }
+
+  const unsigned EndIndex = BeginIndex + Ty->getNumElements();
+  const unsigned WideElts = VecTy->getNumElements();
+  auto IsInsertedLane = [BeginIndex, EndIndex](unsigned Lane) {
+    return Lane >= BeginIndex && Lane < EndIndex;
+  };
+
+  // Step 1: widen the incoming vector to the full width, moving its elements
+  // to their final lanes and leaving every other lane undef.
+  SmallVector<Constant *, 8> Mask;
+  Mask.reserve(WideElts);
+  for (unsigned Lane = 0; Lane != WideElts; ++Lane) {
+    if (IsInsertedLane(Lane))
+      Mask.push_back(IRB.getInt32(Lane - BeginIndex));
+    else
+      Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+  }
+  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                              ConstantVector::get(Mask), Name + ".expand");
+  DEBUG(dbgs() << " shuffle: " << *V << "\n");
+
+  // Step 2: pick the widened value in the inserted lanes and the old value
+  // everywhere else.
+  Mask.clear();
+  for (unsigned Lane = 0; Lane != WideElts; ++Lane)
+    Mask.push_back(IRB.getInt1(IsInsertedLane(Lane)));
+
+  V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
+
+  DEBUG(dbgs() << " blend: " << *V << "\n");
+  return V;
+}
+
+/// \brief Visitor to rewrite instructions using a particular slice of an alloca
+/// to use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class llvm::sroa::AllocaSliceRewriter
+ : public InstVisitor<AllocaSliceRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
+ typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
+
+ const DataLayout &DL;
+ AllocaSlices &AS;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // This is a convenience and flag variable that will be null unless the new
+ // alloca's integer operations should be widened to this integer type due to
+ // passing isIntegerWideningViable above. If it is non-null, the desired
+ // integer type will be stored here for easy access during rewriting.
+ IntegerType *IntTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - The set of accessing instructions is only one of those handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+ // The original offset of the slice currently being rewritten relative to
+ // the original alloca.
+ uint64_t BeginOffset, EndOffset;
+ // The new offsets of the slice currently being rewritten relative to the
+ // original alloca.
+ uint64_t NewBeginOffset, NewEndOffset;
+
+ uint64_t SliceSize;
+ bool IsSplittable;
+ bool IsSplit;
+ Use *OldUse;
+ Instruction *OldPtr;
+
+ // Track post-rewrite users which are PHI nodes and Selects.
+ SmallPtrSetImpl<PHINode *> &PHIUsers;
+ SmallPtrSetImpl<SelectInst *> &SelectUsers;
+
+ // Utility IR builder, whose name prefix is setup for each visited use, and
+ // the insertion point is set to point to the user.
+ IRBuilderTy IRB;
+
+public:
+ /// Construct a rewriter that redirects uses of \p OldAI to \p NewAI.
+ ///
+ /// \param NewAllocaBeginOffset, NewAllocaEndOffset Offsets delimiting the
+ /// new alloca; visit() compares each slice's offsets against them.
+ /// \param IsIntegerPromotable When true, IntTy is set to an integer type with
+ /// the same bit size as NewAI's allocated type; otherwise IntTy is null.
+ /// \param PromotableVecTy Stored as VecTy. When non-null, ElementTy and
+ /// ElementSize (in bytes) are derived from it and NumVectorized is bumped.
+ /// \param PHIUsers, SelectUsers Sets held by reference for tracking
+ /// post-rewrite PHI and select users.
+ ///
+ /// Integer widening and vector promotion are mutually exclusive (asserted).
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+ AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewAllocaBeginOffset,
+ uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+ VectorType *PromotableVecTy,
+ SmallPtrSetImpl<PHINode *> &PHIUsers,
+ SmallPtrSetImpl<SelectInst *> &SelectUsers)
+ : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewAllocaBeginOffset),
+ NewAllocaEndOffset(NewAllocaEndOffset),
+ NewAllocaTy(NewAI.getAllocatedType()),
+ IntTy(IsIntegerPromotable
+ ? Type::getIntNTy(
+ NewAI.getContext(),
+ DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ : nullptr),
+ VecTy(PromotableVecTy),
+ ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+ BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
+ OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ IRB(NewAI.getContext(), ConstantFolder()) {
+ if (VecTy) {
+ assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ++NumVectorized;
+ }
+ // At most one of IntTy and VecTy may be set.
+ assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
+ }
+
+  /// Rewrite the use described by slice \p I so that it targets the new
+  /// alloca.
+  ///
+  /// Records the slice's offsets, its intersection with the new alloca and
+  /// the old use/pointer in the member state, positions the IR builder at the
+  /// using instruction, and dispatches to the matching visit method.
+  ///
+  /// \returns the result of the dispatched visit method. It is asserted to be
+  /// true whenever vector or integer promotion was requested.
+  bool visit(AllocaSlices::const_iterator I) {
+    BeginOffset = I->beginOffset();
+    EndOffset = I->endOffset();
+    IsSplittable = I->isSplittable();
+    // The slice is "split" when it sticks out of the new alloca on either
+    // side.
+    IsSplit = !(BeginOffset >= NewAllocaBeginOffset &&
+                EndOffset <= NewAllocaEndOffset);
+    DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+    DEBUG(AS.printSlice(dbgs(), I, ""));
+    DEBUG(dbgs() << "\n");
+
+    // Intersect the slice with the range covered by the new alloca.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+    SliceSize = NewEndOffset - NewBeginOffset;
+
+    OldUse = I->getUse();
+    OldPtr = cast<Instruction>(OldUse->get());
+
+    // New instructions go right before the user and inherit its debug
+    // location; their names are prefixed with the new alloca's name and the
+    // slice's begin offset.
+    Instruction *UserInst = cast<Instruction>(OldUse->getUser());
+    IRB.SetInsertPoint(UserInst);
+    IRB.SetCurrentDebugLocation(UserInst->getDebugLoc());
+    IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+    const bool CanSROA = visit(UserInst);
+    if (VecTy || IntTy)
+      assert(CanSROA);
+    return CanSROA;
+  }
+
+private:
+ // Make sure the other visit overloads are visible.
+ using Base::visit;
+
+ // Every instruction which can end up as a user must have a rewrite rule.
+ // This is the fallback for instruction kinds without a dedicated visit
+ // method; reaching it is treated as a bug (llvm_unreachable).
+ bool visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ /// Build a pointer of type \p PointerTy to the start of the current slice
+ /// within the new alloca (offset NewBeginOffset - NewAllocaBeginOffset),
+ /// using getAdjustedPtr.
+ ///
+ /// When NDEBUG is not defined, a name prefix for the new instructions is
+ /// derived from the old pointer's name with SROA-generated components
+ /// stripped; otherwise an empty Twine is passed.
+ Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+ // Note that the offset computation can use BeginOffset or NewBeginOffset
+ // interchangeably for unsplit slices.
+ assert(IsSplit || BeginOffset == NewBeginOffset);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+ StringRef OldName = OldPtr->getName();
+ // Skip through the last '.sroa.' component of the name.
+ size_t LastSROAPrefix = OldName.rfind(".sroa.");
+ if (LastSROAPrefix != StringRef::npos) {
+ OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+ // Look for an SROA slice index.
+ size_t IndexEnd = OldName.find_first_not_of("0123456789");
+ if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+ // Strip the index and look for the offset.
+ OldName = OldName.substr(IndexEnd + 1);
+ size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+ if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+ // Strip the offset.
+ OldName = OldName.substr(OffsetEnd + 1);
+ }
+ }
+ // Strip any SROA suffixes as well.
+ OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+ // The last argument (name prefix) depends on NDEBUG: the cleaned-up old
+ // name when NDEBUG is not defined, nothing otherwise.
+ return getAdjustedPtr(IRB, DL, &NewAI,
+ APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+#ifndef NDEBUG
+ Twine(OldName) + "."
+#else
+ Twine()
+#endif
+ );
+ }
+
+  /// \brief Compute the alignment to use when accessing the current slice of
+  /// the *new* alloca.
+  ///
+  /// The result is the largest alignment implied by both the new alloca's
+  /// alignment (its ABI type alignment if none is set) and the slice's byte
+  /// offset within it. If \p Ty is given and its ABI alignment equals that
+  /// value, zero is returned instead.
+  unsigned getSliceAlign(Type *Ty = nullptr) {
+    unsigned BaseAlign = NewAI.getAlignment();
+    if (BaseAlign == 0)
+      BaseAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
+
+    const unsigned Align =
+        MinAlign(BaseAlign, NewBeginOffset - NewAllocaBeginOffset);
+
+    // Zero stands for "the type's ABI alignment is already right".
+    if (Ty && DL.getABITypeAlignment(Ty) == Align)
+      return 0;
+    return Align;
+  }
+
+  /// Translate the byte offset \p Offset into an element index of the
+  /// promoted vector. Only valid while rewriting to a vector (VecTy set); the
+  /// offset must fall exactly on an element boundary.
+  unsigned getIndex(uint64_t Offset) {
+    assert(VecTy && "Can only call getIndex when rewriting a vector");
+    const uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+    const uint64_t WideIndex = RelOffset / ElementSize;
+    assert(WideIndex < UINT32_MAX && "Index out of bounds");
+    const uint32_t Index = static_cast<uint32_t>(WideIndex);
+    assert(Index * ElementSize == RelOffset);
+    return Index;
+  }
+
+  /// Queue \p V (which must be an instruction) in the pass's dead-instruction
+  /// set if it is trivially dead.
+  void deleteIfTriviallyDead(Value *V) {
+    auto *Inst = cast<Instruction>(V);
+    if (!isInstructionTriviallyDead(Inst))
+      return;
+    Pass.DeadInsts.insert(Inst);
+  }
+
+  /// Load the whole new (vector) alloca and extract the elements covered by
+  /// the current slice.
+  Value *rewriteVectorizedLoadInst() {
+    const unsigned BeginIndex = getIndex(NewBeginOffset);
+    const unsigned EndIndex = getIndex(NewEndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+
+    Value *WholeVec =
+        IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+    return extractVector(IRB, WholeVec, BeginIndex, EndIndex, "vec");
+  }
+
+  /// Rewrite a non-volatile integer load as a load of the whole new alloca,
+  /// converted to the widened integer type IntTy, followed by extraction of
+  /// the bytes covered by the current slice.
+  Value *rewriteIntegerLoad(LoadInst &LI) {
+    assert(IntTy && "We cannot insert an integer to the alloca");
+    assert(!LI.isVolatile());
+
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+    V = convertValue(DL, IRB, V, IntTy);
+
+    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+    const uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+    const bool CoversWholeAlloca =
+        Offset == 0 && NewEndOffset >= NewAllocaEndOffset;
+    if (!CoversWholeAlloca) {
+      IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
+      V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
+    }
+
+    // It is possible that the extracted type is not the load type. This
+    // happens if there is a load past the end of the alloca, and as
+    // a consequence the slice is narrower but still a candidate for integer
+    // lowering. To handle this case, we just zero extend the extracted
+    // integer.
+    assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
+           "Can only handle an extract for an overly wide load");
+    const bool LoadIsWider =
+        cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8;
+    if (LoadIsWider)
+      V = IRB.CreateZExt(V, LI.getType());
+    return V;
+  }
+
+ /// Rewrite a load from the old alloca into a load from the new alloca.
+ ///
+ /// One of four strategies is used, in this order: the vector path (VecTy
+ /// set), the widened-integer path (IntTy set and the load is an integer), a
+ /// direct load of the whole new alloca when the slice covers it and the
+ /// types are compatible, or a load through an adjusted pointer into the new
+ /// alloca. For split loads the partial value is inserted into a placeholder
+ /// of the original type so the remaining parts can be filled in later.
+ ///
+ /// \returns true unless the load is volatile or had to go through an
+ /// adjusted pointer.
+ bool visitLoadInst(LoadInst &LI) {
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+
+ // A split load only produces the bytes inside this alloca, as an integer
+ // of exactly SliceSize bytes.
+ Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
+ : LI.getType();
+ const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst();
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(LI);
+ } else if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+ (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+ TargetTy->isIntegerTy()))) {
+ // The slice covers the whole new alloca: load it directly.
+ LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ V = NewLI;
+
+ // If this is an integer load past the end of the slice (which means the
+ // bytes outside the slice are undef or this load is dead) just forcibly
+ // fix the integer size with correct handling of endianness.
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+ if (AITy->getBitWidth() < TITy->getBitWidth()) {
+ V = IRB.CreateZExt(V, TITy, "load.ext");
+ if (DL.isBigEndian())
+ V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ }
+ } else {
+ // Fall back to loading through a pointer into the new alloca that has
+ // been adjusted to the slice's offset and the target type.
+ Type *LTy = TargetTy->getPointerTo();
+ LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(TargetTy),
+ LI.isVolatile(), LI.getName());
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ V = NewLI;
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(DL, IRB, V, TargetTy);
+
+ if (IsSplit) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
+ "Split load isn't smaller than original load");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ DL.getTypeStoreSizeInBits(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
+ Value *Placeholder =
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+ "insert");
+ LI.replaceAllUsesWith(V);
+ Placeholder->replaceAllUsesWith(&LI);
+ delete Placeholder;
+ } else {
+ LI.replaceAllUsesWith(V);
+ }
+
+ // The original load is now dead; its pointer operand may be as well.
+ Pass.DeadInsts.insert(&LI);
+ deleteIfTriviallyDead(OldOp);
+ DEBUG(dbgs() << " to: " << *V << "\n");
+ return !LI.isVolatile() && !IsPtrAdjusted;
+ }
+
+  /// Rewrite a store as a whole-vector store to the new alloca.
+  ///
+  /// If \p V is not already of the full vector type, it is converted to the
+  /// type of the covered elements and merged into the currently stored vector
+  /// (loaded first) before being written back. \p OldOp is not used.
+  ///
+  /// \returns true always.
+  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+    const bool IsPartialStore = V->getType() != VecTy;
+    if (IsPartialStore) {
+      const unsigned BeginIndex = getIndex(NewBeginOffset);
+      const unsigned EndIndex = getIndex(NewEndOffset);
+      assert(EndIndex > BeginIndex && "Empty vector!");
+      const unsigned NumElements = EndIndex - BeginIndex;
+      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+      // Scalar for one element, otherwise a narrower vector.
+      Type *SliceTy = ElementTy;
+      if (NumElements != 1)
+        SliceTy = VectorType::get(ElementTy, NumElements);
+      if (V->getType() != SliceTy)
+        V = convertValue(DL, IRB, V, SliceTy);
+
+      // Mix in the existing elements.
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+      V = insertVector(IRB, Old, V, BeginIndex, "vec");
+    }
+
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+    Pass.DeadInsts.insert(&SI);
+
+    (void)Store;
+    DEBUG(dbgs() << " to: " << *Store << "\n");
+    return true;
+  }
+
+ /// Rewrite a non-volatile integer store as a store of the whole new alloca.
+ ///
+ /// When \p V is narrower than the widened integer type IntTy, the current
+ /// contents are loaded, converted to IntTy, and the stored value is inserted
+ /// at the slice's offset; the result is converted to the alloca type and
+ /// stored. The original store is queued for deletion.
+ ///
+ /// \returns true always.
+ bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ // The slice is required to begin inside the new alloca here.
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ // NOTE(review): this inserts SI.getValueOperand() rather than the V
+ // argument; the two differ if the caller narrowed V -- confirm intended.
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
+ }
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ /// Rewrite a store to the old alloca into a store to the new alloca.
+ ///
+ /// A stored value wider than the slice is first narrowed to the bytes that
+ /// belong to this slice. The store is then emitted via the vector path
+ /// (VecTy set), the widened-integer path (IntTy set and an integer value),
+ /// a direct store to the whole new alloca, or a store through an adjusted
+ /// pointer into it.
+ ///
+ /// \returns for the vector and integer paths, whatever those helpers return;
+ /// otherwise true only if the new store writes directly to the new alloca
+ /// and the original store is not volatile.
+ bool visitStoreInst(StoreInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ // The stored value is wider than this slice: keep only the bytes that
+ // fall inside the slice.
+ if (SliceSize < DL.getTypeStoreSize(V->getType())) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(V->getType()->getIntegerBitWidth() ==
+ DL.getTypeStoreSizeInBits(V->getType()) &&
+ "Non-byte-multiple bit width");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+ "extract");
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(V, SI, OldOp);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(V, SI);
+
+ const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
+ StoreInst *NewSI;
+ if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+ (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+ V->getType()->isIntegerTy()))) {
+ // If this is an integer store past the end of slice (and thus the bytes
+ // past that point are irrelevant or this is unreachable), truncate the
+ // value prior to storing.
+ if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (VITy->getBitWidth() > AITy->getBitWidth()) {
+ if (DL.isBigEndian())
+ V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ V = IRB.CreateTrunc(V, AITy, "load.trunc");
+ }
+
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ SI.isVolatile());
+ } else {
+ // Store through a pointer into the new alloca adjusted to the slice's
+ // offset and the value's type.
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+ NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
+ SI.isVolatile());
+ }
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
+ // The original store is now dead; its pointer operand may be as well.
+ Pass.DeadInsts.insert(&SI);
+ deleteIfTriviallyDead(OldOp);
+
+ DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+  /// \brief Build an integer whose every byte equals the given i8 value.
+  ///
+  /// The byte is zero-extended to the result width and multiplied by the
+  /// constant (all-ones of the wide type) / (all-ones byte, zero-extended),
+  /// which replicates it into each byte position.
+  ///
+  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+  /// call this routine.
+  /// FIXME: Heed the advice above.
+  ///
+  /// \param V The i8 value to splat.
+  /// \param Size The number of bytes in the output (assuming i8 is one byte)
+  Value *getIntegerSplat(Value *V, unsigned Size) {
+    assert(Size > 0 && "Expected a positive number of bytes.");
+    IntegerType *VTy = cast<IntegerType>(V->getType());
+    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+
+    // A single byte is its own splat.
+    if (Size == 1)
+      return V;
+
+    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+
+    // Constant multiplier that copies the low byte into every byte.
+    Constant *WideOnes = Constant::getAllOnesValue(SplatIntTy);
+    Constant *ByteOnes = ConstantExpr::getZExt(
+        Constant::getAllOnesValue(V->getType()), SplatIntTy);
+    Constant *Replicator = ConstantExpr::getUDiv(WideOnes, ByteOnes);
+
+    Value *Widened = IRB.CreateZExt(V, SplatIntTy, "zext");
+    return IRB.CreateMul(Widened, Replicator, "isplat");
+  }
+
+  /// \brief Broadcast \p V into a vector with \p NumElements elements.
+  Value *getVectorSplat(Value *V, unsigned NumElements) {
+    Value *Splat = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+    DEBUG(dbgs() << " splat: " << *Splat << "\n");
+    return Splat;
+  }
+
+ /// Rewrite a memset whose destination is the old alloca pointer so that it
+ /// targets the new alloca slice.
+ ///
+ /// A variable-length memset is retargeted in place. A constant-length one is
+ /// queued in Pass.DeadInsts and replaced either by a memset over the new
+ /// slice or by a store of a splatted value (vector element insert, widened
+ /// integer insert, or a whole-alloca scalar/vector value).
+ ///
+ /// \returns false whenever a memset intrinsic remains or is re-emitted;
+ /// otherwise true only if the original memset was not volatile.
+ bool visitMemSetInst(MemSetInst &II) {
+ DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getRawDest() == OldPtr);
+
+ // If the memset has a variable size, it cannot be split, just adjust the
+ // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ assert(!IsSplit);
+ assert(NewBeginOffset == BeginOffset);
+ II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(AllocaTy) ||
+ !AllocaTy->isSingleValueType() ||
+ !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
+ DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+ CallInst *New = IRB.CreateMemSet(
+ getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+ getSliceAlign(), II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
+ Value *V;
+
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ // Splat the byte to one element, then across the covered elements.
+ Value *Splat =
+ getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
+ Splat = convertValue(DL, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(Splat, NumElements);
+
+ // Merge the splat into the current contents of the alloca.
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ V = getIntegerSplat(II.getValue(), Size);
+
+ // NOTE(review): the second comparison is against NewAllocaBeginOffset, not
+ // NewAllocaEndOffset, so it is true whenever EndOffset differs from the
+ // alloca's begin offset and the else branch below looks effectively
+ // unreachable -- confirm whether NewAllocaEndOffset was intended.
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaBeginOffset)) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, V, Offset, "insert");
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(DL, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(NewBeginOffset == NewAllocaBeginOffset);
+ assert(NewEndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(V, AllocaVecTy->getNumElements());
+
+ V = convertValue(DL, IRB, V, AllocaTy);
+ }
+
+ // Store the fully built value over the whole new alloca.
+ Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
+
+ /// Rewrite a memory transfer intrinsic that reads from or writes to the old
+ /// alloca pointer.
+ ///
+ /// Unsplittable transfers are updated in place. Splittable ones are either
+ /// re-emitted as a memcpy over the new slice or lowered to a load/store pair,
+ /// using vector or integer insert/extract when only part of the new alloca
+ /// is covered.
+ ///
+ /// \returns false whenever a transfer intrinsic remains or is re-emitted;
+ /// otherwise true only if the transfer was not volatile.
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ DEBUG(dbgs() << " original: " << II << "\n");
+
+ // Determine which end of the transfer is the use being rewritten.
+ bool IsDest = &II.getRawDestUse() == OldUse;
+ assert((IsDest && II.getRawDest() == OldPtr) ||
+ (!IsDest && II.getRawSource() == OldPtr));
+
+ unsigned SliceAlign = getSliceAlign();
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+ // memcpy, and so simply updating the pointers is necessary for us to
+ // update both source and dest of a single call.
+ if (!IsSplittable) {
+ Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ if (IsDest)
+ II.setDest(AdjustedPtr);
+ else
+ II.setSource(AdjustedPtr);
+
+ // Only ever lower the intrinsic's alignment, never raise it.
+ if (II.getAlignment() > SliceAlign) {
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(
+ ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+ }
+
+ DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy =
+ !VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+ // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+ // size hasn't been shrunk based on analysis of the viable range, this is
+ // a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ // Ensure the start lines up.
+ assert(NewBeginOffset == BeginOffset);
+
+ // Rewrite the size as needed.
+ if (NewEndOffset != EndOffset)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ NewEndOffset - NewBeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ assert(AI != &OldAI && AI != &NewAI &&
+ "Splittable transfers cannot reach the same alloca on both ends.");
+ Pass.Worklist.insert(AI);
+ }
+
+ Type *OtherPtrTy = OtherPtr->getType();
+ unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+ // Compute the relative offset for the other pointer within the transfer.
+ unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
+ APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+ // An alignment of zero on the intrinsic is treated as 1 here.
+ unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
+ OtherOffset.zextOrTrunc(64).getZExtValue());
+
+ if (EmitMemCpy) {
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+
+ Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+
+ CallInst *New = IRB.CreateMemCpy(
+ IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
+ MinAlign(SliceAlign, OtherAlign), II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // From here on the transfer is lowered to a load followed by a store.
+ bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset;
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+ unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
+ unsigned NumElements = EndIndex - BeginIndex;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
+
+ // Reset the other pointer type to match the register type we're going to
+ // use, but using the address space of the original other pointer.
+ if (VecTy && !IsWholeAlloca) {
+ if (NumElements == 1)
+ OtherPtrTy = VecTy->getElementType();
+ else
+ OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
+
+ OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
+ } else if (IntTy && !IsWholeAlloca) {
+ OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
+ } else {
+ OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
+ }
+
+ // Set up as if the new alloca is the destination, then swap both the
+ // pointers and their alignments when it is actually the source.
+ Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+ unsigned SrcAlign = OtherAlign;
+ Value *DstPtr = &NewAI;
+ unsigned DstAlign = SliceAlign;
+ if (!IsDest) {
+ std::swap(SrcPtr, DstPtr);
+ std::swap(SrcAlign, DstAlign);
+ }
+
+ // Produce the value to transfer. When the new alloca is the source and
+ // only part of it is covered, load it whole and extract the piece.
+ Value *Src;
+ if (VecTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+ Src = convertValue(DL, IRB, Src, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
+ } else {
+ Src =
+ IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+ }
+
+ // When the new alloca is the destination and only part of it is covered,
+ // merge the loaded piece into the alloca's current contents.
+ if (VecTy && !IsWholeAlloca && IsDest) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+ Src = convertValue(DL, IRB, Src, NewAllocaTy);
+ }
+
+ StoreInst *Store = cast<StoreInst>(
+ IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
+
+ /// Replace a lifetime.start / lifetime.end marker on the old pointer with
+ /// the same kind of marker covering only the new alloca slice.
+ ///
+ /// The original intrinsic is queued for deletion.
+ ///
+ /// \returns Always true.
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+   assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+          II.getIntrinsicID() == Intrinsic::lifetime_end);
+   DEBUG(dbgs() << " original: " << II << "\n");
+   assert(II.getArgOperand(1) == OldPtr);
+
+   // The old marker is superseded by the one built below.
+   Pass.DeadInsts.insert(&II);
+
+   // The replacement marker spans exactly the rewritten byte range and keeps
+   // the integer type of the original size operand.
+   IntegerType *SizeTy = cast<IntegerType>(II.getArgOperand(0)->getType());
+   ConstantInt *SliceBytes =
+       ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+   Value *SlicePtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+
+   Value *Marker;
+   if (II.getIntrinsicID() != Intrinsic::lifetime_start)
+     Marker = IRB.CreateLifetimeEnd(SlicePtr, SliceBytes);
+   else
+     Marker = IRB.CreateLifetimeStart(SlicePtr, SliceBytes);
+
+   (void)Marker;
+   DEBUG(dbgs() << " to: " << *Marker << "\n");
+   return true;
+ }
+
+ /// Redirect every PHI operand that uses the old pointer to a pointer into
+ /// the new alloca slice, and remember the PHI for later speculation.
+ ///
+ /// \returns Always true.
+ bool visitPHINode(PHINode &PN) {
+   DEBUG(dbgs() << " original: " << PN << "\n");
+   assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+   assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
+
+   // Compute the replacement pointer once, at the position of the old
+   // pointer: that position already dominates the PHI and keeps the new
+   // pointer as local to it as possible. A PHI cannot be an insertion point
+   // itself, so fall back to the first insertion point of its block.
+   IRBuilderTy PtrBuilder(IRB);
+   if (!isa<PHINode>(OldPtr))
+     PtrBuilder.SetInsertPoint(OldPtr);
+   else
+     PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+   PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+
+   Value *SlicePtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
+
+   // Swap out each incoming operand that still names the old pointer.
+   Value *const StalePtr = cast<Value>(OldPtr);
+   std::replace(PN.op_begin(), PN.op_end(), StalePtr, SlicePtr);
+
+   DEBUG(dbgs() << " to: " << PN << "\n");
+   deleteIfTriviallyDead(OldPtr);
+
+   // A PHI is never promotable by itself but can often be speculated. That
+   // check runs outside the rewriter, once the alloca is fully rewritten.
+   PHIUsers.insert(&PN);
+   return true;
+ }
+
+ /// Redirect the true/false operands of a select that use the old pointer to
+ /// a pointer into the new alloca slice, and remember the select for later
+ /// speculation.
+ ///
+ /// \returns Always true.
+ bool visitSelectInst(SelectInst &SI) {
+   DEBUG(dbgs() << " original: " << SI << "\n");
+   assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+          "Pointer isn't an operand!");
+   assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+   assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
+
+   Value *SlicePtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+
+   // Operands 1 and 2 are the two selected values; either or both may be the
+   // old pointer.
+   for (unsigned OpIdx = 1; OpIdx <= 2; ++OpIdx)
+     if (SI.getOperand(OpIdx) == OldPtr)
+       SI.setOperand(OpIdx, SlicePtr);
+
+   DEBUG(dbgs() << " to: " << SI << "\n");
+   deleteIfTriviallyDead(OldPtr);
+
+   // A select is never promotable by itself but can often be speculated. That
+   // check runs outside the rewriter, once the alloca is fully rewritten.
+   SelectUsers.insert(&SI);
+   return true;
+ }
+};
+
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+///
+/// Uses are processed from a worklist: bitcasts, GEPs, PHIs and selects only
+/// enqueue their own users, while simple loads and stores of non-single-value
+/// types are split into one operation per leaf value.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  ///
+  /// \param I The instruction producing the root pointer.
+  /// \returns true if any load or store was rewritten.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
+    }
+    return Changed;
+  }
+
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Use &U : I.uses())
+      if (Visited.insert(U.getUser()).second)
+        Queue.push_back(&U);
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// \brief Generic recursive split emission class.
+  template <typename Derived> class OpSplitter {
+  protected:
+    /// The builder used to form new instructions.
+    IRBuilderTy IRB;
+    /// The indices which to be used with insert- or extractvalue to select the
+    /// appropriate value within the aggregate.
+    SmallVector<unsigned, 4> Indices;
+    /// The indices to a GEP instruction which will move Ptr to the correct slot
+    /// within the aggregate.
+    SmallVector<Value *, 4> GEPIndices;
+    /// The base pointer of the original op, used as a base for GEPing the
+    /// split operations.
+    Value *Ptr;
+
+    /// Initialize the splitter with an insertion point, Ptr and start with a
+    /// single zero GEP index.
+    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+  public:
+    /// \brief Generic recursive split emission routine.
+    ///
+    /// This method recursively splits an aggregate op (load or store) into
+    /// scalar or vector ops. It splits recursively until it hits a single value
+    /// and emits that single value operation via the template argument.
+    ///
+    /// The logic of this routine relies on GEPs and insertvalue and
+    /// extractvalue all operating with the same fundamental index list, merely
+    /// formatted differently (GEPs need actual values).
+    ///
+    /// \param Ty The type being split recursively into smaller ops.
+    /// \param Agg The aggregate value being built up or stored, depending on
+    /// whether this is splitting a load or a store respectively.
+    /// \param Name The name prefix for the emitted values; each level of
+    /// recursion appends "." and the element index.
+    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+      if (Ty->isSingleValueType())
+        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
+      }
+
+      if (StructType *STy = dyn_cast<StructType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
+      }
+
+      llvm_unreachable("Only arrays and structs are aggregate loadable types");
+    }
+  };
+
+  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf load of a single value. This is called at the leaves of the
+    /// recursive emission to actually load values.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Load the single value and insert it using the indices.
+      Value *GEP =
+          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+      DEBUG(dbgs() << " to: " << *Load << "\n");
+    }
+  };
+
+  /// Split a simple load of an aggregate into per-leaf loads whose results
+  /// are reassembled with insertvalue; the original load is erased.
+  ///
+  /// \returns true if the load was rewritten.
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << " original: " << LI << "\n");
+    LoadOpSplitter Splitter(&LI, *U);
+    Value *V = UndefValue::get(LI.getType());
+    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf store of a single value. This is called at the leaves of the
+    /// recursive emission to actually produce stores.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Extract the single value and store it using the indices.
+      //
+      // The extractvalue and the GEP are emitted in separate statements on
+      // purpose: nesting both calls in the CreateStore argument list would
+      // leave the order in which the two instructions are inserted up to the
+      // compiler, because C++ does not specify the evaluation order of
+      // function arguments.
+      Value *ExtractValue =
+          IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+      Value *InBoundsGEP =
+          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+      Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
+      (void)Store;
+      DEBUG(dbgs() << " to: " << *Store << "\n");
+    }
+  };
+
+  /// Split a simple store of an aggregate through the current pointer use
+  /// into per-leaf stores; the original store is erased.
+  ///
+  /// Stores where the current use is not the pointer operand are left alone.
+  ///
+  /// \returns true if the store was rewritten.
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << " original: " << SI << "\n");
+    StoreOpSplitter Splitter(&SI, *U);
+    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  // The remaining visitors only propagate the pointer: they enqueue the users
+  // of the derived value and never rewrite the instruction itself.
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
+
+/// \brief Peel away aggregate types that merely wrap a single inner type.
+///
+/// Starting from \p Ty, this repeatedly steps into the element type of an
+/// array, or into the struct element containing offset zero, for as long as
+/// doing so shrinks neither the alloc size nor the size in bits.
+///
+/// \returns The innermost type reached; \p Ty itself if no layer can be
+/// removed.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+  for (;;) {
+    // Single value types have no wrapping left to remove.
+    if (Ty->isSingleValueType())
+      return Ty;
+
+    const uint64_t OuterAllocSize = DL.getTypeAllocSize(Ty);
+    const uint64_t OuterSizeInBits = DL.getTypeSizeInBits(Ty);
+
+    // Pick the candidate inner type for this layer.
+    Type *Inner = nullptr;
+    if (StructType *STy = dyn_cast<StructType>(Ty)) {
+      const StructLayout *Layout = DL.getStructLayout(STy);
+      unsigned FirstIdx = Layout->getElementContainingOffset(0);
+      Inner = STy->getElementType(FirstIdx);
+    } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+      Inner = ATy->getElementType();
+    } else {
+      return Ty;
+    }
+
+    // Stop as soon as unwrapping would make the type smaller.
+    if (DL.getTypeAllocSize(Inner) < OuterAllocSize ||
+        DL.getTypeSizeInBits(Inner) < OuterSizeInBits)
+      return Ty;
+
+    Ty = Inner;
+  }
+}
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset cause either end of type-based partition to be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+///
+/// \param DL The data layout used for all size and offset queries.
+/// \param Ty The type to search within.
+/// \param Offset The byte offset into \p Ty at which the partition starts.
+/// \param Size The size of the partition in bytes.
+/// \returns A type covering exactly the requested range, or nullptr if no
+/// such type could be found.
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
+ // The whole type is requested: only strip redundant aggregate wrappers.
+ if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
+ return stripAggregateTypeWrapping(DL, Ty);
+ // The requested range does not fit inside the type at all.
+ if (Offset > DL.getTypeAllocSize(Ty) ||
+ (DL.getTypeAllocSize(Ty) - Offset) < Size)
+ return nullptr;
+
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+ // We can't partition pointers...
+ if (SeqTy->isPointerTy())
+ return nullptr;
+
+ Type *ElementTy = SeqTy->getElementType();
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
+ if (NumSkippedElements >= ArrTy->getNumElements())
+ return nullptr;
+ } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
+ if (NumSkippedElements >= VecTy->getNumElements())
+ return nullptr;
+ }
+ // Make the offset relative to the element it falls into.
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+ assert(Size > ElementSize);
+ // The range spans several elements; it must be a whole number of them.
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return nullptr;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return nullptr;
+
+ // Locate the starting element and make the offset relative to it. Note
+ // that EndOffset stays relative to the start of the struct.
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ if (Offset >= ElementSize)
+ return nullptr; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+
+ // The range starts at an element boundary and spans more than one element;
+ // determine the (exclusive) end of the element range it covers.
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return nullptr; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return nullptr;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
+ const StructLayout *SubSL = DL.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return nullptr; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
+
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+ // SROA is tasked with! Sadly, to not have this discrepancy we would have
+ // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
+ // If this was a load we have to track that it can't participate in any
+ // pre-splitting!
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+ if (!SI ||
+ S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ Stores.erase(
+ std::remove_if(Stores.begin(), Stores.end(),
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ // Now we have to go *back* through all the stores, because a later store may
+ // have caused an earlier store's load to become unsplittable and if it is
+ // unsplittable for the later store, then we can't rely on it being split in
+ // the earlier store either.
+ Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI =
+ cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
+ Stores.end());
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
+ Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
+ Loads.end());
+
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+ // splitting in a specific order in order to handle when one of the loads is
+ // the value operand to one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(LI);
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, DL, BasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+ LI->getName());
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+ << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyways.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(SI);
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+ (void)PStore;
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(LI);
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, DL, LoadBasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
+ PartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(SI);
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PStore
+ << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ DeadInsts.insert(LI);
+ }
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+ // Remove the killed slices that have been pre-split.
+ AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+ return S.isDead();
+ }), AS.end());
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+ // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ std::remove_if(
+ PromotableAllocas.begin(), PromotableAllocas.end(),
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+///
+/// \param AI The original alloca whose uses are being rewritten.
+/// \param AS The slices of \p AI; used here to build the rewriter and to
+/// number the new alloca's name by the partition's position.
+/// \param P The partition of \p AS to rewrite.
+/// \returns The alloca the partition now lives in: \p AI itself when the chosen
+/// slice type equals its allocated type, otherwise a newly created alloca. As
+/// written this is never null.
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ Partition &P) {
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *SliceTy = nullptr;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+ if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
+ SliceTy = CommonUseTy;
+ if (!SliceTy)
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+ P.beginOffset(), P.size()))
+ SliceTy = TypePartitionTy;
+ // Prefer a single legal integer of the partition's size when no type was
+ // found so far, or when the type found is an array of integers.
+ if ((!SliceTy || (SliceTy->isArrayTy() &&
+ SliceTy->getArrayElementType()->isIntegerTy())) &&
+ DL.isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
+ if (!SliceTy)
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL.getTypeAllocSize(SliceTy) >= P.size());
+
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+
+ // Vector promotion is only attempted when integer widening is not viable; if
+ // it yields a vector type, that type replaces the slice type chosen above.
+ VectorType *VecTy =
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+ if (VecTy)
+ SliceTy = VecTy;
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ AllocaInst *NewAI;
+ if (SliceTy == AI.getAllocatedType()) {
+ assert(P.beginOffset() == 0 &&
+ "Non-zero begin offset but same alloca type");
+ NewAI = &AI;
+ // FIXME: We should be able to bail at this point with "nothing changed".
+ // FIXME: We might want to defer PHI speculation until after here.
+ // FIXME: return nullptr;
+ } else {
+ unsigned Alignment = AI.getAlignment();
+ if (!Alignment) {
+ // The minimum alignment which users can rely on when the explicit
+ // alignment is omitted or zero is that required by the ABI for this
+ // type.
+ Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
+ }
+ // Fold the partition's begin offset within the original alloca into the
+ // alignment.
+ Alignment = MinAlign(Alignment, P.beginOffset());
+ // If we will get at least this much alignment from the type alone, leave
+ // the alloca's alignment unconstrained.
+ if (Alignment <= DL.getABITypeAlignment(SliceTy))
+ Alignment = 0;
+ // The new alloca is inserted before the original one and is named after
+ // the index of the partition's first slice.
+ NewAI = new AllocaInst(
+ SliceTy, nullptr, Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ ++NumNewAllocas;
+ }
+
+ DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
+
+ // Track the high watermark on the worklist as it is only relevant for
+ // promoted allocas. We will reset it to this point if the alloca is not in
+ // fact scheduled for promotion.
+ unsigned PPWOldSize = PostPromotionWorklist.size();
+ unsigned NumUses = 0;
+ SmallPtrSet<PHINode *, 8> PHIUsers;
+ SmallPtrSet<SelectInst *, 8> SelectUsers;
+
+ AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
+ // Visit the tails of slices split into this partition first, then the
+ // slices that begin in it. Any visit returning false blocks promotion.
+ bool Promotable = true;
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
+ ++NumUses;
+ }
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
+ ++NumUses;
+ }
+
+ NumAllocaPartitionUses += NumUses;
+ MaxUsesPerAllocaPartition =
+ std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
+
+ // Now that we've processed all the slices in the new partition, check if any
+ // PHIs or Selects would block promotion. On the first unsafe user, both sets
+ // are cleared so that nothing is queued for speculation below.
+ for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
+ E = PHIUsers.end();
+ I != E; ++I)
+ if (!isSafePHIToSpeculate(**I)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+ for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
+ E = SelectUsers.end();
+ I != E; ++I)
+ if (!isSafeSelectToSpeculate(**I)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ if (Promotable) {
+ if (PHIUsers.empty() && SelectUsers.empty()) {
+ // Promote the alloca.
+ PromotableAllocas.push_back(NewAI);
+ } else {
+ // If we have either PHIs or Selects to speculate, add them to those
+ // worklists and re-queue the new alloca so that we promote it on the
+ // next iteration.
+ for (PHINode *PHIUser : PHIUsers)
+ SpeculatablePHIs.insert(PHIUser);
+ for (SelectInst *SelectUser : SelectUsers)
+ SpeculatableSelects.insert(SelectUser);
+ Worklist.insert(NewAI);
+ }
+ } else {
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ if (NewAI != &AI)
+ Worklist.insert(NewAI);
+
+ // Drop any post-promotion work items if promotion didn't happen.
+ while (PostPromotionWorklist.size() > PPWOldSize)
+ PostPromotionWorklist.pop_back();
+ }
+
+ return NewAI;
+}
+
+/// \brief Walks the slices of an alloca and form partitions based on them,
+/// rewriting each of their uses.
+///
+/// \param AI The alloca being split.
+/// \param AS The slices of \p AI. They may be modified by pre-splitting, and
+/// are re-sorted here if any slice is made unsplittable.
+/// \returns true if pre-splitting changed anything or any partition was
+/// rewritten; false if there are no slices or nothing was done.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+ if (AS.begin() == AS.end())
+ return false;
+
+ unsigned NumPartitions = 0;
+ bool Changed = false;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Now that we have identified any pre-splitting opportunities, mark any
+ // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+ // to split these during pre-splitting, we want to force them to be
+ // rewritten into a partition.
+ bool IsSorted = true;
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+ // FIXME: We currently leave whole-alloca splittable loads and stores. This
+ // used to be the only splittable loads and stores and we need to be
+ // confident that the above handling of splittable loads and stores is
+ // completely sufficient before we forcibly disable the remaining handling.
+ if (S.beginOffset() == 0 &&
+ S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
+ continue;
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ // Making a slice unsplittable is treated above as invalidating the slice
+ // order, so restore it before forming partitions.
+ if (!IsSorted)
+ std::sort(AS.begin(), AS.end());
+
+ /// \brief Describes the allocas introduced by rewritePartition
+ /// in order to migrate the debug info.
+ ///
+ /// Offset and Size are recorded in bits (see the SizeOfByte scaling where
+ /// pieces are created below).
+ struct Piece {
+ AllocaInst *Alloca;
+ uint64_t Offset;
+ uint64_t Size;
+ Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+ : Alloca(AI), Offset(O), Size(S) {}
+ };
+ SmallVector<Piece, 4> Pieces;
+
+ // Rewrite each partition. Only partitions that ended up in a new alloca are
+ // recorded as pieces; a partition that re-used AI needs no debug-info move.
+ for (auto &P : AS.partitions()) {
+ if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+ Changed = true;
+ if (NewAI != &AI) {
+ uint64_t SizeOfByte = 8;
+ uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+ // Don't include any padding.
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+ Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+ }
+ }
+ ++NumPartitions;
+ }
+
+ NumAllocaPartitions += NumPartitions;
+ MaxPartitionsPerAlloca =
+ std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+
+ // Migrate debug information from the old alloca to the new alloca(s)
+ // and the individual partitions.
+ if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+ auto *Var = DbgDecl->getVariable();
+ auto *Expr = DbgDecl->getExpression();
+ DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+ bool IsSplit = Pieces.size() > 1;
+ for (auto Piece : Pieces) {
+ // Create a piece expression describing the new partition or reuse AI's
+ // expression if there is only one partition.
+ auto *PieceExpr = Expr;
+ if (IsSplit || Expr->isBitPiece()) {
+ // If this alloca is already a scalar replacement of a larger aggregate,
+ // Piece.Offset describes the offset inside the scalar.
+ uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
+ uint64_t Start = Offset + Piece.Offset;
+ uint64_t Size = Piece.Size;
+ if (Expr->isBitPiece()) {
+ // Clamp the new piece to the extent of the existing bit piece.
+ uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
+ if (Start >= AbsEnd)
+ // No need to describe a SROAed padding.
+ continue;
+ Size = std::min(Size, AbsEnd - Start);
+ }
+ PieceExpr = DIB.createBitPieceExpression(Start, Size);
+ }
+
+ // Remove any existing dbg.declare intrinsic describing the same alloca.
+ if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+ OldDDI->eraseFromParent();
+
+ DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
+ &AI);
+ }
+ }
+ return Changed;
+}
+
+/// \brief Overwrite a use with undef and queue the previously used value for
+/// deletion if that leaves it trivially dead.
+void SROA::clobberUse(Use &U) {
+  // Remember what the use pointed at before it is overwritten.
+  Value *Previous = U;
+  U = UndefValue::get(Previous->getType());
+
+  // Only instructions can become dead; anything else needs no cleanup.
+  auto *PreviousInst = dyn_cast<Instruction>(Previous);
+  if (!PreviousInst)
+    return;
+
+  // Dead instructions are collected eagerly so that the set of uses of any
+  // alloca stays as small as possible.
+  if (isInstructionTriviallyDead(PreviousInst))
+    DeadInsts.insert(PreviousInst);
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// the slices of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+///
+/// \param AI The alloca to process. It is erased here if it has no uses.
+/// \returns true if anything was changed (including erasing a use-less
+/// alloca), false otherwise.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // Skip alloca forms that this analysis can't handle.
+ if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+ DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
+ return false;
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to promote
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter;
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the slices using a recursive instruction-visiting builder.
+ AllocaSlices AS(DL, AI);
+ DEBUG(AS.print(dbgs()));
+ // If the slices report the alloca as escaped, stop here, but still report
+ // any change made by the aggregate rewriter above.
+ if (AS.isEscaped())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (Instruction *DeadUser : AS.getDeadUsers()) {
+ // Free up everything used by this instruction.
+ for (Use &DeadOp : DeadUser->operands())
+ clobberUse(DeadOp);
+
+ // Now replace the uses of this instruction.
+ DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+
+ // And mark it for deletion.
+ DeadInsts.insert(DeadUser);
+ Changed = true;
+ }
+ for (Use *DeadOp : AS.getDeadOperands()) {
+ clobberUse(*DeadOp);
+ Changed = true;
+ }
+
+ // No slices to split. Leave the dead alloca for a later pass to clean up.
+ if (AS.begin() == AS.end())
+ return Changed;
+
+ Changed |= splitAlloca(AI, AS);
+
+ // Drain the PHI and select speculation worklists before returning.
+ DEBUG(dbgs() << " Speculating PHIs\n");
+ while (!SpeculatablePHIs.empty())
+ speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+ DEBUG(dbgs() << " Speculating Selects\n");
+ while (!SpeculatableSelects.empty())
+ speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+ return Changed;
+}
+
+/// \brief Erase every instruction queued in DeadInsts.
+///
+/// The queue is drained to a fixed point: erasing an instruction can make its
+/// operands trivially dead, in which case they are queued and erased as well.
+/// Deletion is batched here, at the end of a run, to keep the recursive delete
+/// local and to limit the window in which instruction pointers held by the
+/// intermediate stages of the algorithm could be invalidated.
+///
+/// Each alloca erased is also recorded in \p DeletedAllocas so that it is not
+/// later handed to mem2reg for promotion.
+void SROA::deleteDeadInstructions(
+    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+  while (!DeadInsts.empty()) {
+    Instruction *Dead = DeadInsts.pop_back_val();
+    DEBUG(dbgs() << "Deleting dead instruction: " << *Dead << "\n");
+
+    // Detach all remaining users before touching the operands.
+    Dead->replaceAllUsesWith(UndefValue::get(Dead->getType()));
+
+    for (Use &Op : Dead->operands()) {
+      auto *OpInst = dyn_cast<Instruction>(Op);
+      if (!OpInst)
+        continue;
+
+      // Drop the reference and queue the operand if nothing else keeps it
+      // alive.
+      Op = nullptr;
+      if (isInstructionTriviallyDead(OpInst))
+        DeadInsts.insert(OpInst);
+    }
+
+    if (auto *DeadAlloca = dyn_cast<AllocaInst>(Dead)) {
+      DeletedAllocas.insert(DeadAlloca);
+      // A dbg.declare describing the alloca goes away together with it.
+      DbgDeclareInst *Declare = FindAllocaDbgDeclare(DeadAlloca);
+      if (Declare)
+        Declare->eraseFromParent();
+    }
+
+    ++NumDeleted;
+    Dead->eraseFromParent();
+  }
+}
+
+/// \brief Promote every alloca collected in PromotableAllocas.
+///
+/// The whole list is handed to PromoteMemToReg and then emptied. With an empty
+/// list nothing happens.
+///
+/// \returns true if there was at least one alloca to promote.
+bool SROA::promoteAllocas(Function &F) {
+  const bool HaveCandidates = !PromotableAllocas.empty();
+  if (HaveCandidates) {
+    NumPromoted += PromotableAllocas.size();
+
+    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+    PromotableAllocas.clear();
+  }
+  return HaveCandidates;
+}
+
+/// \brief Run SROA over one function using the supplied analyses.
+///
+/// Seeds the worklist with the allocas of the entry block, then alternates
+/// between processing allocas and promoting them until no work remains.
+///
+/// \param F The function to transform.
+/// \param RunDT Dominator tree for \p F.
+/// \param RunAC Assumption cache for \p F.
+/// \returns PreservedAnalyses::all() if nothing changed, otherwise
+/// PreservedAnalyses::none().
+PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
+                                AssumptionCache &RunAC) {
+  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+  C = &F.getContext();
+  DT = &RunDT;
+  AC = &RunAC;
+
+  // Collect the allocas of the entry block. The block's last instruction is
+  // deliberately left out of the scan.
+  BasicBlock &Entry = F.getEntryBlock();
+  for (BasicBlock::iterator It = Entry.begin(), Last = std::prev(Entry.end());
+       It != Last; ++It)
+    if (auto *Alloca = dyn_cast<AllocaInst>(&*It))
+      Worklist.insert(Alloca);
+
+  bool Changed = false;
+  // Allocas erased while cleaning up dead instructions; they must be purged
+  // from every list that may still refer to them.
+  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+  do {
+    while (!Worklist.empty()) {
+      if (runOnAlloca(*Worklist.pop_back_val()))
+        Changed = true;
+      deleteDeadInstructions(DeletedAllocas);
+
+      if (DeletedAllocas.empty())
+        continue;
+
+      // Stop tracking the erased allocas so that they are neither processed
+      // nor promoted later on.
+      auto WasDeleted = [&](AllocaInst *Candidate) {
+        return DeletedAllocas.count(Candidate) != 0;
+      };
+      Worklist.remove_if(WasDeleted);
+      PostPromotionWorklist.remove_if(WasDeleted);
+      auto FirstDeleted = std::remove_if(PromotableAllocas.begin(),
+                                         PromotableAllocas.end(), WasDeleted);
+      PromotableAllocas.erase(FirstDeleted, PromotableAllocas.end());
+      DeletedAllocas.clear();
+    }
+
+    if (promoteAllocas(F))
+      Changed = true;
+
+    // Work that was deferred until after promotion becomes the next round.
+    Worklist = PostPromotionWorklist;
+    PostPromotionWorklist.clear();
+  } while (!Worklist.empty());
+
+  // FIXME: Even when promoting allocas we should preserve some abstract set of
+  // CFG-specific analyses.
+  if (!Changed)
+    return PreservedAnalyses::all();
+  return PreservedAnalyses::none();
+}
+
+/// \brief Entry point that obtains the analyses from \p AM.
+///
+/// Fetches the dominator tree and the assumption cache for \p F from the
+/// analysis manager and forwards them to runImpl.
+PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) {
+  auto &DomTree = AM->getResult<DominatorTreeAnalysis>(F);
+  auto &Assumptions = AM->getResult<AssumptionAnalysis>(F);
+  return runImpl(F, DomTree, Assumptions);
+}
+
+/// Legacy pass manager wrapper around the \c SROA pass.
+///
+/// The class lives in the llvm namespace only so that it can be declared a
+/// friend of the \c SROA pass.
+class llvm::sroa::SROALegacyPass : public FunctionPass {
+  /// The wrapped SROA implementation that does the actual work.
+  SROA Impl;
+
+public:
+  /// Pass identifier; its address is what identifies the pass.
+  static char ID;
+
+  SROALegacyPass() : FunctionPass(ID) {
+    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Runs SROA on \p F unless the function is to be skipped.
+  ///
+  /// \returns true if the implementation reports that not all analyses were
+  /// preserved, i.e. that the function was changed.
+  bool runOnFunction(Function &F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+
+    DominatorTree &DomTree =
+        getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    AssumptionCache &Assumptions =
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    PreservedAnalyses Preserved = Impl.runImpl(F, DomTree, Assumptions);
+    return !Preserved.areAllPreserved();
+  }
+
+  /// Declares the analyses this pass requires and those it preserves.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  const char *getPassName() const override { return "SROA"; }
+};
+
+// Definition of the pass identifier that SROALegacyPass passes to the
+// FunctionPass constructor.
+char SROALegacyPass::ID = 0;
+
+/// Returns a newly allocated SROALegacyPass.
+FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
+
+// Pass initialization for SROALegacyPass under the command-line name "sroa",
+// listing AssumptionCacheTracker and DominatorTreeWrapperPass as
+// dependencies (matching what getAnalysisUsage requires).
+INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
+ "Scalar Replacement Of Aggregates", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 0000000..52d477c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,244 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+
+using namespace llvm;
+
+/// initializeScalarOpts - Initialize all passes linked into the
+/// ScalarOpts library by calling each pass's initialize*Pass on \p Registry.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+ initializeADCELegacyPassPass(Registry);
+ initializeBDCEPass(Registry);
+ initializeAlignmentFromAssumptionsPass(Registry);
+ initializeConstantHoistingPass(Registry);
+ initializeConstantPropagationPass(Registry);
+ initializeCorrelatedValuePropagationPass(Registry);
+ initializeDCEPass(Registry);
+ initializeDeadInstEliminationPass(Registry);
+ initializeScalarizerPass(Registry);
+ initializeDSEPass(Registry);
+ initializeGVNPass(Registry);
+ initializeEarlyCSELegacyPassPass(Registry);
+ initializeFlattenCFGPassPass(Registry);
+ initializeInductiveRangeCheckEliminationPass(Registry);
+ initializeIndVarSimplifyPass(Registry);
+ initializeJumpThreadingPass(Registry);
+ initializeLICMPass(Registry);
+ initializeLoopDeletionPass(Registry);
+ initializeLoopAccessAnalysisPass(Registry);
+ initializeLoopInstSimplifyPass(Registry);
+ initializeLoopInterchangePass(Registry);
+ initializeLoopRotatePass(Registry);
+ initializeLoopStrengthReducePass(Registry);
+ initializeLoopRerollPass(Registry);
+ initializeLoopUnrollPass(Registry);
+ initializeLoopUnswitchPass(Registry);
+ initializeLoopIdiomRecognizePass(Registry);
+ initializeLowerAtomicPass(Registry);
+ initializeLowerExpectIntrinsicPass(Registry);
+ initializeMemCpyOptPass(Registry);
+ initializeMergedLoadStoreMotionPass(Registry);
+ initializeNaryReassociatePass(Registry);
+ initializePartiallyInlineLibCallsPass(Registry);
+ initializeReassociatePass(Registry);
+ initializeRegToMemPass(Registry);
+ initializeRewriteStatepointsForGCPass(Registry);
+ initializeSCCPPass(Registry);
+ initializeIPSCCPPass(Registry);
+ initializeSROALegacyPassPass(Registry);
+ initializeSROA_DTPass(Registry);
+ initializeSROA_SSAUpPass(Registry);
+ initializeCFGSimplifyPassPass(Registry);
+ initializeStructurizeCFGPass(Registry);
+ initializeSinkingPass(Registry);
+ initializeTailCallElimPass(Registry);
+ initializeSeparateConstOffsetFromGEPPass(Registry);
+ initializeSpeculativeExecutionPass(Registry);
+ initializeStraightLineStrengthReducePass(Registry);
+ initializeLoadCombinePass(Registry);
+ initializePlaceBackedgeSafepointsImplPass(Registry);
+ initializePlaceSafepointsPass(Registry);
+ initializeFloat2IntPass(Registry);
+ initializeLoopDistributePass(Registry);
+ initializeLoopLoadEliminationPass(Registry);
+}
+
+/// C API: runs initializeScalarOpts() on the pass registry wrapped by \p R.
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+  PassRegistry &Registry = *unwrap(R);
+  initializeScalarOpts(Registry);
+}
+
+/// C API: appends the pass returned by createAggressiveDCEPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createAggressiveDCEPass());
+}
+
+/// C API: appends the pass returned by createBitTrackingDCEPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createBitTrackingDCEPass());
+}
+
+/// C API: appends the pass returned by createAlignmentFromAssumptionsPass() to
+/// the pass manager wrapped by \p PM.
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createAlignmentFromAssumptionsPass());
+}
+
+/// C API: appends the pass returned by createCFGSimplificationPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createCFGSimplificationPass());
+}
+
+/// C API: appends the pass returned by createDeadStoreEliminationPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createDeadStoreEliminationPass());
+}
+
+/// C API: appends the pass returned by createScalarizerPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createScalarizerPass());
+}
+
+/// C API: appends the pass returned by createGVNPass() to the pass manager
+/// wrapped by \p PM.
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createGVNPass());
+}
+
+/// C API: appends the pass returned by createMergedLoadStoreMotionPass() to
+/// the pass manager wrapped by \p PM.
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createMergedLoadStoreMotionPass());
+}
+
+/// C API: appends the pass returned by createIndVarSimplifyPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createIndVarSimplifyPass());
+}
+
+/// C API: appends the pass returned by createInstructionCombiningPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createInstructionCombiningPass());
+}
+
+/// C API: appends the pass returned by createJumpThreadingPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createJumpThreadingPass());
+}
+
+/// C API: appends the pass returned by createLICMPass() to the pass manager
+/// wrapped by \p PM.
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLICMPass());
+}
+
+/// C API: appends the pass returned by createLoopDeletionPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLoopDeletionPass());
+}
+
+/// C API: appends the pass returned by createLoopIdiomPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLoopIdiomPass());
+}
+
+/// C API: appends the pass returned by createLoopRotatePass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLoopRotatePass());
+}
+
+/// C API: appends the pass returned by createLoopRerollPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLoopRerollPass());
+}
+
+/// C API: appends the pass returned by createLoopUnrollPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLoopUnrollPass());
+}
+
+/// C API: appends the pass returned by createLoopUnswitchPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLoopUnswitchPass());
+}
+
+/// C API: appends the pass returned by createMemCpyOptPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createMemCpyOptPass());
+}
+
+/// C API: appends the pass returned by createPartiallyInlineLibCallsPass() to
+/// the pass manager wrapped by \p PM.
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createPartiallyInlineLibCallsPass());
+}
+
+/// C API: appends the pass returned by createLowerSwitchPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLowerSwitchPass());
+}
+
+/// C API: appends the pass returned by createPromoteMemoryToRegisterPass() to
+/// the pass manager wrapped by \p PM.
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createPromoteMemoryToRegisterPass());
+}
+
+/// C API: appends the pass returned by createReassociatePass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createReassociatePass());
+}
+
+/// C API: appends the pass returned by createSCCPPass() to the pass manager
+/// wrapped by \p PM.
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createSCCPPass());
+}
+
+/// C API: appends the pass returned by createScalarReplAggregatesPass(), called
+/// with no arguments, to the pass manager wrapped by \p PM.
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createScalarReplAggregatesPass());
+}
+
+/// C API: appends the pass returned by createScalarReplAggregatesPass(-1,
+/// false) to the pass manager wrapped by \p PM.
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+  const int Threshold = -1;      // First argument of the factory call.
+  const bool UseDomTree = false; // Second argument of the factory call.
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createScalarReplAggregatesPass(Threshold, UseDomTree));
+}
+
+/// C API: appends the pass returned by
+/// createScalarReplAggregatesPass(\p Threshold) to the pass manager wrapped by
+/// \p PM.
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+                                                  int Threshold) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createScalarReplAggregatesPass(Threshold));
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+ // NOTE: The simplify-libcalls pass has been removed, so this function does nothing and PM is unused.
+}
+
+/// C API: appends the pass returned by createTailCallEliminationPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createTailCallEliminationPass());
+}
+
+/// C API: appends the pass returned by createConstantPropagationPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createConstantPropagationPass());
+}
+
+/// C API: appends the pass returned by createDemoteRegisterToMemoryPass() to
+/// the pass manager wrapped by \p PM. Note that the C name says
+/// "MemoryToRegister" while the factory it calls says "RegisterToMemory".
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createDemoteRegisterToMemoryPass());
+}
+
+/// C API: appends the pass returned by createVerifierPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createVerifierPass());
+}
+
+/// C API: appends the pass returned by createCorrelatedValuePropagationPass()
+/// to the pass manager wrapped by \p PM.
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createCorrelatedValuePropagationPass());
+}
+
+/// C API: appends the pass returned by createEarlyCSEPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createEarlyCSEPass());
+}
+
+/// C API: appends the pass returned by createTypeBasedAAWrapperPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createTypeBasedAAWrapperPass());
+}
+
+/// C API: appends the pass returned by createScopedNoAliasAAWrapperPass() to
+/// the pass manager wrapped by \p PM.
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createScopedNoAliasAAWrapperPass());
+}
+
+/// C API: appends the pass returned by createBasicAAWrapperPass() to the pass
+/// manager wrapped by \p PM.
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createBasicAAWrapperPass());
+}
+
+/// C API: appends the pass returned by createLowerExpectIntrinsicPass() to the
+/// pass manager wrapped by \p PM.
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+  auto *PassMgr = unwrap(PM);
+  PassMgr->add(createLowerExpectIntrinsicPass());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 0000000..114d22d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,2630 @@
+//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation implements the well known scalar replacement of
+// aggregates transformation. This xform breaks up alloca instructions of
+// aggregate type (structure or array) into individual alloca instructions for
+// each member (if possible). Then, if possible, it transforms the individual
+// alloca instructions into nice clean scalar SSA form.
+//
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
+// often interact, especially for C++ programs. As such, iterating between
+// SRoA, then Mem2Reg until we run out of things to promote works well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarrepl"
+
+STATISTIC(NumReplaced, "Number of allocas broken up");
+STATISTIC(NumPromoted, "Number of allocas promoted");
+STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion");
+STATISTIC(NumConverted, "Number of aggregates converted to scalar");
+
+namespace {
+#define SROA SROA_
+ /// Common base of the two scalar-replacement passes declared below; the
+ /// subclasses choose the value of HasDomTree.
+ struct SROA : public FunctionPass {
+   /// Each threshold argument may be -1 to select the built-in default:
+   /// 128 for \p T, 32 for \p ST and 8 for \p AT. \p SLT is stored as given,
+   /// so -1 becomes the largest unsigned value and imposes no limit on the
+   /// scalar integer load size.
+   SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)
+       : FunctionPass(ID), HasDomTree(hasDT),
+         SRThreshold(T == -1 ? 128 : T),
+         StructMemberThreshold(ST == -1 ? 32 : ST),
+         ArrayElementThreshold(AT == -1 ? 8 : AT),
+         ScalarLoadThreshold(SLT) {}
+
+   bool runOnFunction(Function &F) override;
+
+   bool performScalarRepl(Function &F);
+   bool performPromotion(Function &F);
+
+ private:
+   /// Constructor argument hasDT; SROA_DT passes true, SROA_SSAUp false.
+   bool HasDomTree;
+
+   /// DeadInsts - Instructions that have been made dead and are waiting to be
+   /// removed once the current piece of work is finished.
+   SmallVector<Value*, 32> DeadInsts;
+
+   /// AllocaInfo - Facts gathered while analyzing the uses of one alloca.
+   /// Every flag starts out false and is set to true when something is
+   /// learned.
+   struct AllocaInfo {
+     /// The alloca to promote.
+     AllocaInst *AI;
+
+     /// CheckedPHIs - PHI nodes that have already been verified, kept to
+     /// prevent infinite looping and redundant work.
+     SmallPtrSet<PHINode*, 8> CheckedPHIs;
+
+     /// isUnsafe - Set when the alloca cannot be SROA'd.
+     bool isUnsafe : 1;
+
+     /// isMemCpySrc - Set when this aggregate is memcpy'd from.
+     bool isMemCpySrc : 1;
+
+     /// isMemCpyDst - Set when this aggregate is memcpy'd into.
+     bool isMemCpyDst : 1;
+
+     /// hasSubelementAccess - Set when a subelement of the alloca is ever
+     /// accessed; stays false when the alloca is only touched by mem
+     /// intrinsics or by loads/stores of the entire alloca at once.
+     bool hasSubelementAccess : 1;
+
+     /// hasALoadOrStore - Set when there is any load or store to the alloca.
+     /// An alloca accessed only through memcpy, for example, leaves this
+     /// false.
+     bool hasALoadOrStore : 1;
+
+     explicit AllocaInfo(AllocaInst *ai)
+         : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
+           hasSubelementAccess(false), hasALoadOrStore(false) {}
+   };
+
+   /// SRThreshold - The maximum alloca size to be considered for SROA.
+   unsigned SRThreshold;
+
+   /// StructMemberThreshold - The maximum number of members a struct may
+   /// contain and still be considered for SROA.
+   unsigned StructMemberThreshold;
+
+   /// ArrayElementThreshold - The maximum number of elements an array may
+   /// have and still be considered for SROA.
+   unsigned ArrayElementThreshold;
+
+   /// ScalarLoadThreshold - The maximum size in bits of scalars to load when
+   /// converting to scalar.
+   unsigned ScalarLoadThreshold;
+
+   /// Flags \p I as unsafe and, in debug output, prints the instruction that
+   /// prevented the transformation.
+   void MarkUnsafe(AllocaInfo &I, Instruction *User) {
+     I.isUnsafe = true;
+     DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n');
+   }
+
+   bool isSafeAllocaToScalarRepl(AllocaInst *AI);
+
+   void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
+   void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
+                                        AllocaInfo &Info);
+   void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
+   void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
+                        Type *MemOpType, bool isStore, AllocaInfo &Info,
+                        Instruction *TheAccess, bool AllowWholeAccess);
+   bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
+                         const DataLayout &DL);
+   uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
+                                 const DataLayout &DL);
+
+   void DoScalarReplacement(AllocaInst *AI,
+                            std::vector<AllocaInst*> &WorkList);
+   void DeleteDeadInstructions();
+
+   void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                             SmallVectorImpl<AllocaInst *> &NewElts);
+   void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                       SmallVectorImpl<AllocaInst *> &NewElts);
+   void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                   SmallVectorImpl<AllocaInst *> &NewElts);
+   void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                 uint64_t Offset,
+                                 SmallVectorImpl<AllocaInst *> &NewElts);
+   void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                     AllocaInst *AI,
+                                     SmallVectorImpl<AllocaInst *> &NewElts);
+   void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+                                      SmallVectorImpl<AllocaInst *> &NewElts);
+   void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                     SmallVectorImpl<AllocaInst *> &NewElts);
+   bool ShouldAttemptScalarRepl(AllocaInst *AI);
+ };
+
+ // SROA_DT - SROA that uses DominatorTree (constructs the SROA base with hasDT = true).
+ struct SROA_DT : public SROA {
+ static char ID;
+ public:
+ SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+ SROA(T, true, ID, ST, AT, SLT) {
+ initializeSROA_DTPass(*PassRegistry::getPassRegistry());
+ }
+
+ // getAnalysisUsage - This pass requires the assumption cache tracker and the
+ // dominator tree, and we know it will not alter the CFG, so say so.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+ }
+ };
+
+ // SROA_SSAUp - SROA that uses SSAUpdater (constructs the SROA base with hasDT = false).
+ struct SROA_SSAUp : public SROA {
+ static char ID;
+ public:
+ SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+ SROA(T, false, ID, ST, AT, SLT) {
+ initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
+ }
+
+ // getAnalysisUsage - This pass requires only the assumption cache tracker,
+ // and we know it will not alter the CFG, so say so.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.setPreservesCFG();
+ }
+ };
+
+}
+
+char SROA_DT::ID = 0; // Passed to the SROA base constructor by SROA_DT().
+char SROA_SSAUp::ID = 0; // Passed to the SROA base constructor by SROA_SSAUp().
+
+INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
+ "Scalar Replacement of Aggregates (DT)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) // Required in SROA_DT::getAnalysisUsage().
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // Required in SROA_DT::getAnalysisUsage().
+INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
+ "Scalar Replacement of Aggregates (DT)", false, false)
+
+INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
+ "Scalar Replacement of Aggregates (SSAUp)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) // Required in SROA_SSAUp::getAnalysisUsage().
+INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
+ "Scalar Replacement of Aggregates (SSAUp)", false, false)
+
+// Public factory for the ScalarReplAggregates pass: builds the SSAUpdater
+// flavour when UseDomTree is false and the DominatorTree flavour otherwise,
+// forwarding the four thresholds unchanged to the chosen constructor.
+FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
+                                                   bool UseDomTree,
+                                                   int StructMemberThreshold,
+                                                   int ArrayElementThreshold,
+                                                   int ScalarLoadThreshold) {
+  if (!UseDomTree)
+    return new SROA_SSAUp(Threshold, StructMemberThreshold,
+                          ArrayElementThreshold, ScalarLoadThreshold);
+
+  return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold,
+                     ScalarLoadThreshold);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Convert To Scalar Optimization.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
+/// optimization, which scans the uses of an alloca and determines if it can
+/// rewrite it in terms of a single new alloca that can be mem2reg'd.
+class ConvertToScalarInfo {
+ /// AllocaSize - The size of the alloca being considered in bytes.
+ unsigned AllocaSize;
+ const DataLayout &DL; // Used for GEP offsets and the legal-integer check.
+ unsigned ScalarLoadThreshold; // TryConvert refuses integer allocas wider than this many bits.
+
+ /// IsNotTrivial - This is set to true if there is some access to the object
+ /// which means that mem2reg can't promote it.
+ bool IsNotTrivial;
+
+ /// ScalarKind - Tracks the kind of alloca being considered for promotion,
+ /// computed based on the uses of the alloca rather than the LLVM type system.
+ enum {
+ Unknown,
+
+ // Accesses via GEPs that are consistent with element access of a vector
+ // type. This will not be converted into a vector unless there is a later
+ // access using an actual vector type.
+ ImplicitVector,
+
+ // Accesses via vector operations and GEPs that are consistent with the
+ // layout of a vector type.
+ Vector,
+
+ // An integer bag-of-bits with bitwise operations for insertion and
+ // extraction. Any combination of types can be converted into this kind
+ // of scalar.
+ Integer
+ } ScalarKind;
+
+ /// VectorTy - This tracks the type that we should promote the vector to if
+ /// it is possible to turn it into a vector. This starts out null; when a
+ /// vector form turns out not to be possible, ScalarKind is set to Integer.
+ VectorType *VectorTy;
+
+ /// HadNonMemTransferAccess - True if there is at least one access to the
+ /// alloca that is not a MemTransferInst. We don't want to turn structs into
+ /// large integers unless there is some potential for optimization.
+ bool HadNonMemTransferAccess;
+
+ /// HadDynamicAccess - True if some element of this alloca was dynamic.
+ /// We don't yet have support for turning a dynamic access into a large
+ /// integer.
+ bool HadDynamicAccess;
+
+public:
+ explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL,
+ unsigned SLT)
+ : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false),
+ ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false),
+ HadDynamicAccess(false) { }
+
+ AllocaInst *TryConvert(AllocaInst *AI); // Returns the replacement alloca, or null.
+
+private:
+ bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); // Analysis step; Offset is in bytes.
+ void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); // Updates ScalarKind/VectorTy for one access.
+ bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); // Vector case of the function above.
+ void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
+ Value *NonConstantIdx);
+
+ Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder);
+ Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder);
+};
+} // end anonymous namespace.
+
+
+/// TryConvert - Decide whether every use of \p AI can be expressed on a single
+/// vector or integer value. If so, create the replacement alloca in front of
+/// the first instruction of AI's parent block, rewrite all uses of AI onto it
+/// and return it; otherwise return null.
+AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
+  // Give up when some use cannot be converted at all.
+  if (!CanConvertToScalar(AI, 0, nullptr))
+    return nullptr;
+  // Also give up when mem2reg can already promote the alloca trivially.
+  if (!IsNotTrivial)
+    return nullptr;
+
+  // An alloca with only memset / memcpy uses may still be Unknown here, and a
+  // vector type whose width differs from the alloca is not usable; both cases
+  // are treated as an Integer below.
+  if (ScalarKind == Unknown)
+    ScalarKind = Integer;
+  else if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
+    ScalarKind = Integer;
+
+  Type *PromotedTy = nullptr;
+  if (ScalarKind != Vector) {
+    const unsigned NumBits = AllocaSize * 8;
+
+    // Respect the scalar load threshold for the integer form.
+    if (NumBits > ScalarLoadThreshold)
+      return nullptr;
+
+    // Without any access other than memory transfers, only integers that fit
+    // in a legal integer are worth creating.
+    const bool IsIntegerLike =
+        ScalarKind == ImplicitVector || ScalarKind == Integer;
+    if (IsIntegerLike && !HadNonMemTransferAccess &&
+        !DL.fitsInLegalInteger(NumBits))
+      return nullptr;
+
+    // Dynamic accesses on integers aren't yet supported: they would need a
+    // shift by a dynamic amount, and it may not be known whether that shift
+    // should go left or right.
+    if (ScalarKind == Integer && HadDynamicAccess)
+      return nullptr;
+
+    DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
+    PromotedTy = IntegerType::get(AI->getContext(), NumBits);
+  } else {
+    // A vector is only chosen when at least one use had a vector type, so
+    // that unrelated aggregates (e.g. <9 x double>) are not turned into a
+    // pile of insert/extract operations.
+    assert(VectorTy && "Missing type for vector scalar.");
+    DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = "
+                 << *VectorTy << '\n');
+    PromotedTy = VectorTy;
+  }
+
+  // Create the replacement alloca and move every use of AI over to it.
+  Instruction *InsertPt = &AI->getParent()->front();
+  AllocaInst *NewAI = new AllocaInst(PromotedTy, nullptr, "", InsertPt);
+  ConvertUsesToScalar(AI, NewAI, 0, nullptr);
+  return NewAI;
+}
+
+/// MergeInTypeForLoadOrStore - Fold a load or store of type \p In at byte
+/// offset \p Offset into the classification accumulated so far (ScalarKind and
+/// VectorTy).
+///
+/// Possible outcomes:
+///  1) \p In is a vector accepted by MergeInVectorType: the alloca is treated
+///     as a vector, so element accesses can become insert/extract element
+///     operations.
+///  2) \p In is a float, a double, or an integer of at least 8 bits with a
+///     power-of-two width. An access as wide as the alloca is ignored, since
+///     it can always become a bitcast. Otherwise, when the offset and the
+///     alloca size are multiples of the element size and the element size
+///     agrees with any VectorTy already recorded, the access is compatible
+///     with a vector; if no VectorTy existed yet, one is created and
+///     ScalarKind becomes ImplicitVector.
+///  3) Anything else: the alloca is a general blob of memory, recorded by
+///     setting ScalarKind to Integer.
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
+                                                    uint64_t Offset) {
+  // Once the alloca is an integer blob, no access can change that.
+  if (ScalarKind == Integer)
+    return;
+
+  // Vector accesses either merge cleanly or force the integer form.
+  if (VectorType *VecIn = dyn_cast<VectorType>(In)) {
+    if (!MergeInVectorType(VecIn, Offset))
+      ScalarKind = Integer;
+    return;
+  }
+
+  const bool CouldBeElement =
+      In->isFloatTy() || In->isDoubleTy() ||
+      (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
+       isPowerOf2_32(In->getPrimitiveSizeInBits()));
+  if (!CouldBeElement) {
+    ScalarKind = Integer;
+    return;
+  }
+
+  // Full width accesses can be ignored, because they can always be turned
+  // into bitcasts.
+  const unsigned EltBytes = In->getPrimitiveSizeInBits() / 8;
+  if (EltBytes == AllocaSize)
+    return;
+
+  // The access must land on an element boundary, the alloca must hold a whole
+  // number of elements, and any vector type seen before must use elements of
+  // the same size; otherwise fall back to a large integer.
+  if (Offset % EltBytes != 0 || AllocaSize % EltBytes != 0 ||
+      (VectorTy &&
+       EltBytes != VectorTy->getElementType()->getPrimitiveSizeInBits() / 8)) {
+    ScalarKind = Integer;
+    return;
+  }
+
+  // First compatible element access: remember the implied vector type.
+  if (!VectorTy) {
+    ScalarKind = ImplicitVector;
+    VectorTy = VectorType::get(In, AllocaSize / EltBytes);
+  }
+}
+
+/// MergeInVectorType - Vector case of MergeInTypeForLoadOrStore. Returns true
+/// when \p VInTy was merged, which requires it to be exactly as large as the
+/// alloca and accessed at offset zero; returns false otherwise without
+/// changing any state.
+bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
+                                            uint64_t Offset) {
+  // Only whole-alloca vector accesses are accepted.
+  if (VInTy->getBitWidth()/8 != AllocaSize || Offset != 0)
+    return false;
+
+  // The first vector seen fixes the element size. Later vectors of the same
+  // total size are accepted even if their type differs; worst case the
+  // resulting vectors can be bitcast.
+  if (!VectorTy)
+    VectorTy = VInTy;
+  ScalarKind = Vector;
+  return true;
+}
+
+/// CanConvertToScalar - V is a pointer. Walk all of its users, recursing
+/// through bitcasts and GEPs, and return true if every access can be expressed
+/// on a single vector or integer value; return false as soon as one cannot.
+/// As a side effect this updates ScalarKind and VectorTy (through
+/// MergeInTypeForLoadOrStore), sets IsNotTrivial for uses that mem2reg could
+/// not promote, and records HadNonMemTransferAccess / HadDynamicAccess. Offset
+/// is the current offset from the base of the alloca being analyzed.
+///
+/// NonConstantIdx is the variable GEP index seen on the path to V, or null.
+bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
+ Value* NonConstantIdx) {
+ for (User *U : V->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+ // Don't break volatile loads.
+ if (!LI->isSimple())
+ return false;
+ // Don't touch MMX operations.
+ if (LI->getType()->isX86_MMXTy())
+ return false;
+ HadNonMemTransferAccess = true;
+ MergeInTypeForLoadOrStore(LI->getType(), Offset);
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Storing the pointer, not into the value?
+ if (SI->getOperand(0) == V || !SI->isSimple()) return false;
+ // Don't touch MMX operations.
+ if (SI->getOperand(0)->getType()->isX86_MMXTy())
+ return false;
+ HadNonMemTransferAccess = true;
+ MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
+ continue;
+ }
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) {
+ if (!onlyUsedByLifetimeMarkers(BCI))
+ IsNotTrivial = true; // Can't be mem2reg'd.
+ if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) // Same offset: only the users of the cast matter.
+ return false;
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) {
+ // Bail out unless the GEP's pointer operand has pointer type.
+ PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+ if (!PtrTy)
+ return false;
+
+ // Compute the offset that this GEP adds to the pointer.
+ SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+ Value *GEPNonConstantIdx = nullptr;
+ if (!GEP->hasAllConstantIndices()) {
+ if (!isa<VectorType>(PtrTy->getElementType())) // Variable indices are only accepted into a vector.
+ return false;
+ if (NonConstantIdx) // At most one variable index per access path.
+ return false;
+ GEPNonConstantIdx = Indices.pop_back_val(); // The trailing index is taken as the variable one.
+ if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
+ return false;
+ HadDynamicAccess = true;
+ } else
+ GEPNonConstantIdx = NonConstantIdx;
+ uint64_t GEPOffset = DL.getIndexedOffset(PtrTy,
+ Indices);
+ // See if all uses can be converted.
+ if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
+ return false;
+ IsNotTrivial = true; // Can't be mem2reg'd.
+ HadNonMemTransferAccess = true;
+ continue;
+ }
+
+ // If this is a constant sized memset of a constant value (e.g. 0) we can
+ // handle it.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) {
+ // Store to dynamic index.
+ if (NonConstantIdx)
+ return false;
+ // Store of constant value.
+ if (!isa<ConstantInt>(MSI->getValue()))
+ return false;
+
+ // Store of constant size.
+ ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength());
+ if (!Len)
+ return false;
+
+ // If the size differs from the alloca, we can only convert the alloca to
+ // an integer bag-of-bits.
+ // FIXME: This should handle all of the cases that are currently accepted
+ // as vector element insertions.
+ if (Len->getZExtValue() != AllocaSize || Offset != 0)
+ ScalarKind = Integer;
+
+ IsNotTrivial = true; // Can't be mem2reg'd.
+ HadNonMemTransferAccess = true;
+ continue;
+ }
+
+ // If this is a memcpy or memmove into or out of the whole allocation, we
+ // can handle it like a load or store of the scalar type.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) {
+ // Store to dynamic index.
+ if (NonConstantIdx)
+ return false;
+ ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
+ if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0) // Must cover exactly the whole alloca.
+ return false;
+
+ IsNotTrivial = true; // Can't be mem2reg'd.
+ continue;
+ }
+
+ // If this is a lifetime intrinsic, we can handle it.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ continue;
+ }
+ }
+
+ // Otherwise, we cannot handle this!
+ return false;
+ }
+
+ return true;
+}
+
+/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
+/// NewAI directly, erasing each user as it is rewritten. This happens when we
+/// are converting an "integer union" to a single integer scalar, or when we
+/// are converting a "vector union" to a vector with insert/extractelement
+/// instructions. NonConstantIdx is the dynamic GEP index seen so far, or null.
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right. By the end of this, there should be no uses of Ptr.
+void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
+ uint64_t Offset,
+ Value* NonConstantIdx) {
+ while (!Ptr->use_empty()) { // Every path below erases the user it handled.
+ Instruction *User = cast<Instruction>(Ptr->user_back());
+
+ if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
+ ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); // Offset is passed through unchanged.
+ CI->eraseFromParent();
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+ // Compute the offset that this GEP adds to the pointer.
+ SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+ Value* GEPNonConstantIdx = nullptr;
+ if (!GEP->hasAllConstantIndices()) {
+ assert(!NonConstantIdx &&
+ "Dynamic GEP reading from dynamic GEP unsupported");
+ GEPNonConstantIdx = Indices.pop_back_val(); // Last index is treated as the dynamic one; the rest give the constant offset.
+ } else
+ GEPNonConstantIdx = NonConstantIdx;
+ uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(),
+ Indices);
+ ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); // GEPOffset is scaled by 8 to match Offset, which is in bits.
+ GEP->eraseFromParent();
+ continue;
+ }
+
+ IRBuilder<> Builder(User); // New instructions are inserted at the user being replaced.
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ // The load is a bit extract from NewAI shifted right by Offset bits.
+ Value *LoadedVal = Builder.CreateLoad(NewAI);
+ Value *NewLoadVal
+ = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
+ NonConstantIdx, Builder);
+ LI->replaceAllUsesWith(NewLoadVal);
+ LI->eraseFromParent();
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ assert(SI->getOperand(0) != Ptr && "Consistency error!");
+ Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
+ Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
+ NonConstantIdx, Builder);
+ Builder.CreateStore(New, NewAI);
+ SI->eraseFromParent();
+
+ // If the load we just inserted is now dead, then the inserted store
+ // overwrote the entire thing.
+ if (Old->use_empty())
+ Old->eraseFromParent();
+ continue;
+ }
+
+ // If this is a constant sized memset of a constant value (e.g. 0) we can
+ // transform it into a store of the expanded constant value.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+ assert(MSI->getRawDest() == Ptr && "Consistency error!");
+ assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
+ int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
+ if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { // Other lengths emit nothing; the memset is simply erased below.
+ unsigned NumBytes = static_cast<unsigned>(SNumBytes);
+ unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
+
+ // Compute the value replicated the right number of times.
+ APInt APVal(NumBytes*8, Val);
+
+ // Splat the value if non-zero.
+ if (Val)
+ for (unsigned i = 1; i != NumBytes; ++i)
+ APVal |= APVal << 8;
+
+ Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
+ Value *New = ConvertScalar_InsertValue(
+ ConstantInt::get(User->getContext(), APVal),
+ Old, Offset, nullptr, Builder);
+ Builder.CreateStore(New, NewAI);
+
+ // If the load we just inserted is now dead, then the memset overwrote
+ // the entire thing.
+ if (Old->use_empty())
+ Old->eraseFromParent();
+ }
+ MSI->eraseFromParent();
+ continue;
+ }
+
+ // If this is a memcpy or memmove into or out of the whole allocation, we
+ // can handle it like a load or store of the scalar type.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+ assert(Offset == 0 && "must be store to start of alloca");
+ assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
+
+ // If the source and destination are both to the same alloca, then this is
+ // a noop copy-to-self, just delete it. Otherwise, emit a load and store
+ // as appropriate.
+ AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0));
+
+ if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) {
+ // Dest must be OrigAI, change this to be a load from the original
+ // pointer (bitcasted), then a store to our new alloca.
+ assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
+ Value *SrcPtr = MTI->getSource();
+ PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
+ PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+ if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { // Keep the source's address space on the casted pointer type.
+ AIPTy = PointerType::get(AIPTy->getElementType(),
+ SPTy->getAddressSpace());
+ }
+ SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);
+
+ LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
+ SrcVal->setAlignment(MTI->getAlignment());
+ Builder.CreateStore(SrcVal, NewAI);
+ } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) {
+ // Src must be OrigAI, change this to be a load from NewAI then a store
+ // through the original dest pointer (bitcasted).
+ assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
+ LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
+
+ PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
+ PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+ if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { // Keep the destination's address space on the casted pointer type.
+ AIPTy = PointerType::get(AIPTy->getElementType(),
+ DPTy->getAddressSpace());
+ }
+ Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);
+
+ StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
+ NewStore->setAlignment(MTI->getAlignment());
+ } else {
+ // Noop transfer. Src == Dst
+ }
+
+ MTI->eraseFromParent();
+ continue;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ // There's no need to preserve these, as the resulting alloca will be
+ // converted to a register anyways.
+ II->eraseFromParent();
+ continue;
+ }
+ }
+
+ llvm_unreachable("Unsupported operation!"); // No other user kind is handled here (presumably rejected by an earlier validity check).
+ }
+}
+
+/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
+/// or vector value FromVal, taking the bits found at bit offset Offset, and
+/// return it. New instructions are emitted through Builder.
+///
+/// This happens when we are converting an "integer union" to a single
+/// integer scalar, or when we are converting a "vector union" to a vector with
+/// insert/extractelement instructions.
+///
+/// NonConstantIdx, when non-null, is a dynamic element index added to the
+/// constant element index derived from Offset; it is only used for vectors.
+Value *ConvertToScalarInfo::
+ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder) {
+ // If the load is of the whole new alloca, no conversion is needed.
+ Type *FromType = FromVal->getType();
+ if (FromType == ToType && Offset == 0)
+ return FromVal;
+
+ // If the result alloca is a vector type, this is either an element
+ // access or a bitcast to another vector type of the same size.
+ if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+ unsigned FromTypeSize = DL.getTypeAllocSize(FromType);
+ unsigned ToTypeSize = DL.getTypeAllocSize(ToType);
+ if (FromTypeSize == ToTypeSize)
+ return Builder.CreateBitCast(FromVal, ToType);
+
+ // Otherwise it must be an element access.
+ unsigned Elt = 0;
+ if (Offset) {
+ unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType());
+ Elt = Offset/EltSize;
+ assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
+ }
+ // Return the element extracted out of it.
+ Value *Idx;
+ if (NonConstantIdx) {
+ if (Elt)
+ Idx = Builder.CreateAdd(NonConstantIdx,
+ Builder.getInt32(Elt),
+ "dyn.offset");
+ else
+ Idx = NonConstantIdx;
+ } else
+ Idx = Builder.getInt32(Elt);
+ Value *V = Builder.CreateExtractElement(FromVal, Idx);
+ if (V->getType() != ToType)
+ V = Builder.CreateBitCast(V, ToType);
+ return V;
+ }
+
+ // If ToType is a first class aggregate, extract out each of the pieces and
+ // use insertvalue's to form the FCA.
+ if (StructType *ST = dyn_cast<StructType>(ToType)) {
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into struct types not supported");
+ const StructLayout &Layout = *DL.getStructLayout(ST);
+ Value *Res = UndefValue::get(ST);
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
+ Offset+Layout.getElementOffsetInBits(i),
+ nullptr, Builder);
+ Res = Builder.CreateInsertValue(Res, Elt, i);
+ }
+ return Res;
+ }
+
+ if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { // Arrays are rebuilt element by element, like structs above.
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into array types not supported");
+ uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
+ Value *Res = UndefValue::get(AT);
+ for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+ Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
+ Offset+i*EltSize, nullptr,
+ Builder);
+ Res = Builder.CreateInsertValue(Res, Elt, i);
+ }
+ return Res;
+ }
+
+ // Otherwise, this must be a union that was converted to an integer value.
+ IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+
+ // If this is a big-endian system and the load is narrower than the
+ // full alloca type, we need to do a shift to get the right bits.
+ int ShAmt = 0;
+ if (DL.isBigEndian()) {
+ // On big-endian machines, the lowest bit is stored at the bit offset
+ // from the pointer given by getTypeStoreSizeInBits. This matters for
+ // integers with a bitwidth that is not a multiple of 8.
+ ShAmt = DL.getTypeStoreSizeInBits(NTy) -
+ DL.getTypeStoreSizeInBits(ToType) - Offset;
+ } else {
+ ShAmt = Offset;
+ }
+
+ // Note: a negative ShAmt is handled by shifting left instead of right.
+ // We do this to support (f.e.) loads off the end of a structure where
+ // only some bits are used. Shift amounts >= the bit width emit no shift.
+ if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
+ FromVal = Builder.CreateLShr(FromVal,
+ ConstantInt::get(FromVal->getType(), ShAmt));
+ else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
+ FromVal = Builder.CreateShl(FromVal,
+ ConstantInt::get(FromVal->getType(), -ShAmt));
+
+ // Finally, truncate or zero-extend the integer to the width of ToType.
+ unsigned LIBitWidth = DL.getTypeSizeInBits(ToType);
+ if (LIBitWidth < NTy->getBitWidth())
+ FromVal =
+ Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
+ LIBitWidth));
+ else if (LIBitWidth > NTy->getBitWidth())
+ FromVal =
+ Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
+ LIBitWidth));
+
+ // If the result is an integer, this is a trunc or bitcast.
+ if (ToType->isIntegerTy()) {
+ // Should be done.
+ } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
+ // Just do a bitcast, we know the sizes match up.
+ FromVal = Builder.CreateBitCast(FromVal, ToType);
+ } else {
+ // Otherwise must be a pointer.
+ FromVal = Builder.CreateIntToPtr(FromVal, ToType);
+ }
+ assert(FromVal->getType() == ToType && "Didn't convert right?");
+ return FromVal;
+}
+
+/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
+/// or vector value "Old" at the offset specified by Offset, and return the
+/// combined value. New instructions are emitted through Builder.
+/// This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+///
+/// NonConstantIdx is an index value if there was a GEP with a non-constant
+/// index value. If this is null then all GEPs used to find this insert
+/// address are constant.
+Value *ConvertToScalarInfo::
+ConvertScalar_InsertValue(Value *SV, Value *Old,
+ uint64_t Offset, Value* NonConstantIdx,
+ IRBuilder<> &Builder) {
+ // Convert the stored type to the actual type, shift it left to insert
+ // then 'or' into place.
+ Type *AllocaType = Old->getType();
+ LLVMContext &Context = Old->getContext();
+
+ if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+ uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy);
+ uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType());
+
+ // Changing the whole vector with memset or with an access of a different
+ // vector type?
+ if (ValSize == VecSize)
+ return Builder.CreateBitCast(SV, AllocaType);
+
+ // Must be an element insertion.
+ Type *EltTy = VTy->getElementType();
+ if (SV->getType() != EltTy)
+ SV = Builder.CreateBitCast(SV, EltTy);
+ uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy);
+ unsigned Elt = Offset/EltSize; // Constant element index; any remainder of the division is discarded.
+ Value *Idx;
+ if (NonConstantIdx) {
+ if (Elt)
+ Idx = Builder.CreateAdd(NonConstantIdx,
+ Builder.getInt32(Elt),
+ "dyn.offset");
+ else
+ Idx = NonConstantIdx;
+ } else
+ Idx = Builder.getInt32(Elt);
+ return Builder.CreateInsertElement(Old, SV, Idx);
+ }
+
+ // If SV is a first-class aggregate value, insert each value recursively.
+ if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into struct types not supported");
+ const StructLayout &Layout = *DL.getStructLayout(ST);
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ Value *Elt = Builder.CreateExtractValue(SV, i);
+ Old = ConvertScalar_InsertValue(Elt, Old,
+ Offset+Layout.getElementOffsetInBits(i),
+ nullptr, Builder);
+ }
+ return Old;
+ }
+
+ if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { // Arrays are inserted element by element, like structs above.
+ assert(!NonConstantIdx &&
+ "Dynamic indexing into array types not supported");
+ uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
+ for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+ Value *Elt = Builder.CreateExtractValue(SV, i);
+ Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr,
+ Builder);
+ }
+ return Old;
+ }
+
+ // If SV is a float or vector, bitcast it to an integer of the same width.
+ // If it is a pointer, convert it with ptrtoint instead.
+ unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType());
+ unsigned DestWidth = DL.getTypeSizeInBits(AllocaType);
+ unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType());
+ unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType);
+ if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
+ SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
+ else if (SV->getType()->isPointerTy())
+ SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType()));
+
+ // Zero extend or truncate the value if needed.
+ if (SV->getType() != AllocaType) {
+ if (SV->getType()->getPrimitiveSizeInBits() <
+ AllocaType->getPrimitiveSizeInBits())
+ SV = Builder.CreateZExt(SV, AllocaType);
+ else {
+ // Truncation may be needed if storing more than the alloca can hold
+ // (undefined behavior).
+ SV = Builder.CreateTrunc(SV, AllocaType);
+ SrcWidth = DestWidth;
+ SrcStoreWidth = DestStoreWidth;
+ }
+ }
+
+ // If this is a big-endian system and the store is narrower than the
+ // full alloca type, we need to do a shift to get the right bits.
+ int ShAmt = 0;
+ if (DL.isBigEndian()) {
+ // On big-endian machines, the lowest bit is stored at the bit offset
+ // from the pointer given by getTypeStoreSizeInBits. This matters for
+ // integers with a bitwidth that is not a multiple of 8.
+ ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+ } else {
+ ShAmt = Offset;
+ }
+
+ // Note: a negative ShAmt is handled by shifting right instead of left.
+ // We do this to support (f.e.) stores off the end of a structure where
+ // only some bits in the structure are set.
+ APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); // Bits of Old that SV replaces; shifted together with SV below.
+ if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+ SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt));
+ Mask <<= ShAmt;
+ } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+ SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt));
+ Mask = Mask.lshr(-ShAmt);
+ }
+
+ // Mask out the bits we are about to insert from the old value, and or
+ // in the new bits.
+ if (SrcWidth != DestWidth) { // With equal widths SV is returned as-is and Old is not merged in.
+ assert(DestWidth > SrcWidth);
+ Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask");
+ SV = Builder.CreateOr(Old, SV, "ins");
+ }
+ return SV;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SRoA Driver
+//===----------------------------------------------------------------------===//
+
+
+/// Pass entry point: alternate promotion and scalar replacement on F until
+/// one of the two phases stops making progress. Returns true if any phase
+/// reported a change.
+bool SROA::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  // Promote whatever is already promotable before splitting anything.
+  bool MadeChange = performPromotion(F);
+
+  // A successful scalar replacement is followed by another promotion round;
+  // the loop ends as soon as either phase reports that it changed nothing.
+  while (performScalarRepl(F)) {
+    MadeChange = true;
+    if (!performPromotion(F))
+      break;
+  }
+
+  return MadeChange;
+}
+
+namespace {
+/// LoadAndStorePromoter subclass that promotes one alloca at a time through
+/// SSAUpdater. It also gathers the dbg.declare / dbg.value intrinsics that
+/// refer to the alloca so they can be updated during promotion and erased
+/// afterwards.
+class AllocaPromoter : public LoadAndStorePromoter {
+  AllocaInst *AI;                        // Alloca currently being promoted.
+  DIBuilder *DIB;                        // Emits the replacement debug intrinsics.
+  SmallVector<DbgDeclareInst *, 4> DDIs; // dbg.declare users found for AI.
+  SmallVector<DbgValueInst *, 4> DVIs;   // dbg.value users found for AI.
+
+public:
+  AllocaPromoter(ArrayRef<Instruction*> Insts, SSAUpdater &S,
+                 DIBuilder *DB)
+    : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {}
+
+  /// Promote AI using the loads/stores in Insts, then erase AI and every
+  /// debug intrinsic that was collected for it.
+  void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
+    // isInstInList needs to know which alloca is being promoted.
+    this->AI = AI;
+
+    // Collect the debug intrinsics that use AI through its metadata wrapper.
+    if (auto *LocalMD = LocalAsMetadata::getIfExists(AI)) {
+      if (auto *MDValue =
+              MetadataAsValue::getIfExists(AI->getContext(), LocalMD)) {
+        for (User *U : MDValue->users()) {
+          if (auto *Declare = dyn_cast<DbgDeclareInst>(U))
+            DDIs.push_back(Declare);
+          else if (auto *DbgVal = dyn_cast<DbgValueInst>(U))
+            DVIs.push_back(DbgVal);
+        }
+      }
+    }
+
+    LoadAndStorePromoter::run(Insts);
+    AI->eraseFromParent();
+
+    for (DbgDeclareInst *Declare : DDIs)
+      Declare->eraseFromParent();
+    for (DbgValueInst *DbgVal : DVIs)
+      DbgVal->eraseFromParent();
+  }
+
+  /// A load or store belongs to this promotion iff its address operand is
+  /// the alloca itself. Anything that is not a load must be a store.
+  bool isInstInList(Instruction *I,
+                    const SmallVectorImpl<Instruction*> &Insts) const override {
+    Value *Addr;
+    if (auto *Load = dyn_cast<LoadInst>(I))
+      Addr = Load->getOperand(0);
+    else
+      Addr = cast<StoreInst>(I)->getPointerOperand();
+    return Addr == AI;
+  }
+
+  /// Emit debug-value information for Inst (a load or store being promoted)
+  /// on behalf of every collected dbg.declare and dbg.value.
+  void updateDebugInfo(Instruction *Inst) const override {
+    StoreInst *Store = dyn_cast<StoreInst>(Inst);
+    LoadInst *Load = Store ? nullptr : dyn_cast<LoadInst>(Inst);
+
+    for (DbgDeclareInst *Declare : DDIs) {
+      if (Store)
+        ConvertDebugDeclareToDebugValue(Declare, Store, *DIB);
+      else if (Load)
+        ConvertDebugDeclareToDebugValue(Declare, Load, *DIB);
+    }
+
+    // Neither a store nor a load: nothing is emitted for the dbg.values.
+    if (!Store && !Load)
+      return;
+
+    // Work out which value the new dbg.value intrinsics should describe.
+    Value *Described = nullptr;
+    if (Store) {
+      Value *Stored = Store->getOperand(0);
+      // If an argument is zero or sign extended then use the argument
+      // directly. The extension may be zapped by a later optimization pass.
+      if (auto *ZExt = dyn_cast<ZExtInst>(Stored))
+        Described = dyn_cast<Argument>(ZExt->getOperand(0));
+      if (auto *SExt = dyn_cast<SExtInst>(Stored))
+        Described = dyn_cast<Argument>(SExt->getOperand(0));
+      if (!Described)
+        Described = Stored;
+    } else {
+      Described = Load->getOperand(0);
+    }
+
+    for (DbgValueInst *DbgVal : DVIs)
+      DIB->insertDbgValueIntrinsic(Described, 0, DbgVal->getVariable(),
+                                   DbgVal->getExpression(),
+                                   DbgVal->getDebugLoc(), Inst);
+  }
+};
+} // end anon namespace
+
+/// isSafeSelectToSpeculate - Decide whether loads through a select of
+/// pointers may be replaced by a select of two loads, so that the load of
+/// the alloca can be promoted. The rewrite turns
+///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+///   %V = load i32* %P2
+/// into
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   %V2 = load i32* %Other
+///   %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// This is allowed when every user of the select is a simple load and both
+/// select operands can be loaded unconditionally at each of those loads.
+static bool isSafeSelectToSpeculate(SelectInst *SI) {
+  Value *TrueVal = SI->getTrueValue();
+  Value *FalseVal = SI->getFalseValue();
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+
+  // An operand that is dereferenceable outright (e.g. an alloca) needs no
+  // per-load check; otherwise each load must show that the access is safe.
+  const bool NeedTrueCheck = !isDereferenceablePointer(TrueVal, DL);
+  const bool NeedFalseCheck = !isDereferenceablePointer(FalseVal, DL);
+
+  for (User *U : SI->users()) {
+    LoadInst *Load = dyn_cast<LoadInst>(U);
+    if (!Load)
+      return false;
+    if (!Load->isSimple())
+      return false;
+
+    if (NeedTrueCheck &&
+        !isSafeToLoadUnconditionally(TrueVal, Load, Load->getAlignment()))
+      return false;
+    if (NeedFalseCheck &&
+        !isSafeToLoadUnconditionally(FalseVal, Load, Load->getAlignment()))
+      return false;
+  }
+
+  return true;
+}
+
+/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers in the pred
+/// blocks and then PHI the results, allowing the load of the alloca to be
+/// promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a PHI if its only uses are loads and if the incoming
+/// pointers of the PHI can be loaded safely in the predecessor blocks.
+static bool isSafePHIToSpeculate(PHINode *PN) {
+  // Current restrictions: every user must be a simple load located in the
+  // PHI's own block, with nothing that may write memory between the PHI and
+  // that load.
+  // TODO: Allow recursive phi users.
+  // TODO: Allow stores.
+  BasicBlock *PhiBlock = PN->getParent();
+  unsigned MaxAlign = 0;
+  for (User *U : PN->users()) {
+    LoadInst *Load = dyn_cast<LoadInst>(U);
+    if (!Load || !Load->isSimple())
+      return false;
+
+    // Loads in other blocks are not handled. The same-block case is the
+    // common one, produced when instcombine merges two loads through a PHI.
+    if (Load->getParent() != PhiBlock)
+      return false;
+
+    // Walk from the PHI to the load and bail out on anything that could
+    // store in between.
+    BasicBlock::iterator Scan(PN);
+    while (&*Scan != Load) {
+      if (Scan->mayWriteToMemory())
+        return false;
+      ++Scan;
+    }
+
+    MaxAlign = std::max(MaxAlign, Load->getAlignment());
+  }
+
+  const DataLayout &DL = PN->getModule()->getDataLayout();
+
+  // All users are acceptable loads. Now check that a load of each incoming
+  // pointer can be placed at the end of its predecessor block; the concern is
+  // a possibly trapping load placed on a critical edge.
+  for (unsigned Idx = 0, NumIncoming = PN->getNumIncomingValues();
+       Idx != NumIncoming; ++Idx) {
+    Value *Incoming = PN->getIncomingValue(Idx);
+    auto *Term = PN->getIncomingBlock(Idx)->getTerminator();
+
+    // A terminator with side effects (an invoke) leaves no safe place for the
+    // load, and a pointer produced by the terminator itself leaves no valid
+    // place for it.
+    if (Term->mayHaveSideEffects())
+      return false;
+    if (Term == Incoming)
+      return false;
+
+    // With a single successor the edge is not critical.
+    if (Term->getNumSuccessors() == 1)
+      continue;
+
+    // Otherwise the pointer must be always loadable, or provably already
+    // accessed in the predecessor.
+    if (!isDereferenceablePointer(Incoming, DL) &&
+        !isSafeToLoadUnconditionally(Incoming, Term, MaxAlign))
+      return false;
+  }
+
+  return true;
+}
+
+
+/// tryToMakeAllocaBePromotable - Return true if AI is only used by simple
+/// loads from it and simple stores into it. Selects, PHIs and lifetime-only
+/// bitcasts of AI are rewritten or removed first when that is safe, so this
+/// is a non-pure predicate. DL is only forwarded to the recursive restart.
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) {
+ SetVector<Instruction*, SmallVector<Instruction*, 4>,
+ SmallPtrSet<Instruction*, 4> > InstsToRewrite;
+ for (User *U : AI->users()) { // Phase 1: classify every user; collect the ones that need rewriting.
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (!LI->isSimple())
+ return false;
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == AI || !SI->isSimple())
+ return false; // Don't allow a store OF the AI, only INTO the AI.
+ continue;
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(U)) {
+ // If the condition being selected on is a constant, fold the select, yes
+ // this does (rarely) happen early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) {
+ Value *Result = SI->getOperand(1+CI->isZero()); // Operand 2 when the condition is zero, operand 1 otherwise.
+ SI->replaceAllUsesWith(Result);
+ SI->eraseFromParent();
+
+ // This is very rare and we just scrambled the use list of AI, start
+ // over completely.
+ return tryToMakeAllocaBePromotable(AI, DL);
+ }
+
+ // If it is safe to turn "load (select c, AI, ptr)" into a select of two
+ // loads, then we can transform this by rewriting the select.
+ if (!isSafeSelectToSpeculate(SI))
+ return false;
+
+ InstsToRewrite.insert(SI);
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ if (PN->use_empty()) { // Dead PHIs can be stripped.
+ InstsToRewrite.insert(PN);
+ continue;
+ }
+
+ // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
+ // in the pred blocks, then we can transform this by rewriting the PHI.
+ if (!isSafePHIToSpeculate(PN))
+ return false;
+
+ InstsToRewrite.insert(PN);
+ continue;
+ }
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ if (onlyUsedByLifetimeMarkers(BCI)) {
+ InstsToRewrite.insert(BCI);
+ continue;
+ }
+ }
+
+ return false; // Any other kind of user blocks promotion.
+ }
+
+ // If there are no instructions to rewrite, then all uses are load/stores and
+ // we're done!
+ if (InstsToRewrite.empty())
+ return true;
+
+ // If we have instructions that need to be rewritten for this to be promotable
+ // take care of it now.
+ for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
+ // This could only be a bitcast used by nothing but lifetime intrinsics.
+ for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end();
+ I != E;)
+ cast<Instruction>(*I++)->eraseFromParent(); // The iterator is advanced before the user is erased.
+ BCI->eraseFromParent();
+ continue;
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
+ // Selects in InstsToRewrite only have load uses. Rewrite each as two
+ // loads with a new select.
+ while (!SI->use_empty()) {
+ LoadInst *LI = cast<LoadInst>(SI->user_back());
+
+ IRBuilder<> Builder(LI);
+ LoadInst *TrueLoad =
+ Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
+ LoadInst *FalseLoad =
+ Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
+
+ // Transfer alignment and AA info if present.
+ TrueLoad->setAlignment(LI->getAlignment());
+ FalseLoad->setAlignment(LI->getAlignment());
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TrueLoad->setAAMetadata(Tags);
+ FalseLoad->setAAMetadata(Tags);
+ }
+
+ Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
+ V->takeName(LI);
+ LI->replaceAllUsesWith(V);
+ LI->eraseFromParent();
+ }
+
+ // Now that all the loads are gone, the select is gone too.
+ SI->eraseFromParent();
+ continue;
+ }
+
+ // Otherwise, we have a PHI node which allows us to push the loads into the
+ // predecessors.
+ PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
+ if (PN->use_empty()) {
+ PN->eraseFromParent();
+ continue;
+ }
+
+ Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
+ PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
+ PN->getName()+".ld", PN);
+
+ // Get the AA tags and alignment to use from one of the loads. It doesn't
+ // matter which one we get and if any differ, it doesn't matter.
+ LoadInst *SomeLoad = cast<LoadInst>(PN->user_back());
+
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
+ unsigned Align = SomeLoad->getAlignment();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ while (!PN->use_empty()) {
+ LoadInst *LI = cast<LoadInst>(PN->user_back());
+ LI->replaceAllUsesWith(NewPN);
+ LI->eraseFromParent();
+ }
+
+ // Inject loads into all of the pred blocks. Keep track of which blocks we
+ // insert them into in case we have multiple edges from the same block.
+ DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { // NOTE: this i/e pair shadows the outer loop's variables.
+ BasicBlock *Pred = PN->getIncomingBlock(i);
+ LoadInst *&Load = InsertedLoads[Pred];
+ if (!Load) {
+ Load = new LoadInst(PN->getIncomingValue(i),
+ PN->getName() + "." + Pred->getName(),
+ Pred->getTerminator());
+ Load->setAlignment(Align);
+ if (AATags) Load->setAAMetadata(AATags);
+ }
+
+ NewPN->addIncoming(Load, Pred);
+ }
+
+ PN->eraseFromParent();
+ }
+
+ ++NumAdjusted;
+ return true;
+}
+
+/// Promote the promotable allocas of F's entry block, repeating until a scan
+/// finds no candidate. Uses PromoteMemToReg when a dominator tree is
+/// available and AllocaPromoter/SSAUpdater otherwise. Returns true if at
+/// least one alloca was promoted.
+bool SROA::performPromotion(Function &F) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  DominatorTree *DT = nullptr;
+  if (HasDomTree)
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+  BasicBlock &Entry = F.getEntryBlock();
+  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
+
+  std::vector<AllocaInst*> Promotable;
+  SmallVector<Instruction*, 64> AllocaUsers; // Reused across allocas.
+  bool MadeChange = false;
+
+  while (true) {
+    Promotable.clear();
+
+    // Scan the entry block (terminator excluded) for allocas that are, or
+    // can be made, promotable.
+    for (BasicBlock::iterator It = Entry.begin(), Last = --Entry.end();
+         It != Last; ++It) {
+      AllocaInst *AI = dyn_cast<AllocaInst>(It);
+      if (AI && tryToMakeAllocaBePromotable(AI, DL))
+        Promotable.push_back(AI);
+    }
+
+    if (Promotable.empty())
+      break;
+
+    if (HasDomTree) {
+      PromoteMemToReg(Promotable, *DT, nullptr, &AC);
+    } else {
+      SSAUpdater SSA;
+      for (AllocaInst *AI : Promotable) {
+        // Gather the instructions to promote for this alloca.
+        for (User *U : AI->users())
+          AllocaUsers.push_back(cast<Instruction>(U));
+        AllocaPromoter(AllocaUsers, SSA, &DIB).run(AI, AllocaUsers);
+        AllocaUsers.clear();
+      }
+    }
+
+    NumPromoted += Promotable.size();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+
+/// ShouldAttemptScalarRepl - Return true if AI allocates a struct with at
+/// most StructMemberThreshold members or an array with at most
+/// ArrayElementThreshold elements. Every other allocated type is rejected.
+bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
+  Type *AllocTy = AI->getAllocatedType();
+
+  // Structs: reject those with too many members.
+  if (auto *StructTy = dyn_cast<StructType>(AllocTy)) {
+    if (StructTy->getNumElements() <= StructMemberThreshold)
+      return true;
+    return false;
+  }
+
+  // Arrays: reject those with too many elements.
+  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocTy)) {
+    if (ArrayTy->getNumElements() <= ArrayElementThreshold)
+      return true;
+    return false;
+  }
+
+  return false;
+}
+
+// performScalarRepl - Worklist-driven scalar replacement over the allocas of
+// the entry block. Each alloca is either erased (when dead), split into
+// per-element allocas, converted to a single scalar alloca, or left alone.
+// Returns true if anything was changed.
+//
+bool SROA::performScalarRepl(Function &F) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Seed the worklist with every alloca found in the entry block.
+  std::vector<AllocaInst*> WorkList;
+  for (Instruction &I : F.getEntryBlock())
+    if (AllocaInst *Candidate = dyn_cast<AllocaInst>(&I))
+      WorkList.push_back(Candidate);
+
+  bool MadeChange = false;
+  while (!WorkList.empty()) {
+    AllocaInst *AI = WorkList.back();
+    WorkList.pop_back();
+
+    // Dead allocas are simply removed. These can be formed by SROA'ing
+    // arrays with unused elements.
+    if (AI->use_empty()) {
+      AI->eraseFromParent();
+      MadeChange = true;
+      continue;
+    }
+
+    // Array allocations and unsized types cannot be handled at all.
+    if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
+      continue;
+
+    // Skip zero-sized allocations (e.g. [0 x %struct]) and anything larger
+    // than the size threshold.
+    const uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+    if (AllocaSize == 0)
+      continue;
+    if (AllocaSize > SRThreshold)
+      continue;
+
+    // Preferred transformation: split the aggregate into one alloca per
+    // element when it is a good candidate and all its users can be rewritten.
+    if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) {
+      DoScalarReplacement(AI, WorkList);
+      MadeChange = true;
+      continue;
+    }
+
+    // Fallback: try to turn the aggregate (potentially with casts) into a
+    // single scalar alloca that mem2reg can later promote.
+    AllocaInst *NewAI =
+        ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold)
+            .TryConvert(AI);
+    if (!NewAI)
+      continue; // Couldn't process this alloca.
+
+    NewAI->takeName(AI);
+    AI->eraseFromParent();
+    ++NumConverted;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// DoScalarReplacement - AI has already satisfied isSafeAllocaToScalarRepl.
+/// Create one alloca per struct member or array element, queue each of them
+/// on WorkList, rewrite every use of AI in terms of the new allocas, and
+/// erase AI.
+void SROA::DoScalarReplacement(AllocaInst *AI,
+                               std::vector<AllocaInst*> &WorkList) {
+  DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
+
+  SmallVector<AllocaInst*, 32> ElementAllocas;
+
+  // Creates the alloca for element Idx immediately before AI, records it and
+  // adds it to the worklist for recursive processing.
+  auto EmitElement = [&](Type *EltTy, unsigned Idx) {
+    AllocaInst *NA = new AllocaInst(EltTy, nullptr, AI->getAlignment(),
+                                    AI->getName() + "." + Twine(Idx), AI);
+    ElementAllocas.push_back(NA);
+    WorkList.push_back(NA);
+  };
+
+  Type *AllocTy = AI->getAllocatedType();
+  if (StructType *ST = dyn_cast<StructType>(AllocTy)) {
+    ElementAllocas.reserve(ST->getNumContainedTypes());
+    const unsigned NumFields = ST->getNumContainedTypes();
+    for (unsigned Idx = 0; Idx != NumFields; ++Idx)
+      EmitElement(ST->getContainedType(Idx), Idx);
+  } else {
+    ArrayType *AT = cast<ArrayType>(AllocTy);
+    ElementAllocas.reserve(AT->getNumElements());
+    Type *ElTy = AT->getElementType();
+    const unsigned NumElts = AT->getNumElements();
+    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
+      EmitElement(ElTy, Idx);
+  }
+
+  // With the element allocas in place, redirect all uses of the old alloca.
+  RewriteForScalarRepl(AI, AI, 0, ElementAllocas);
+
+  // Clean up whatever the rewrite left dead, then drop the old alloca.
+  DeleteDeadInstructions();
+  AI->eraseFromParent();
+
+  ++NumReplaced;
+}
+
+ /// DeleteDeadInstructions - Drain the DeadInsts list. Every popped
+ /// instruction is erased, and any operand instruction that becomes trivially
+ /// dead once that use is dropped is queued for deletion as well.
+ void SROA::DeleteDeadInstructions() {
+   while (!DeadInsts.empty()) {
+     Instruction *Dead = cast<Instruction>(DeadInsts.pop_back_val());
+
+     User::op_iterator OpIt = Dead->op_begin(), OpEnd = Dead->op_end();
+     for (; OpIt != OpEnd; ++OpIt) {
+       Instruction *OpInst = dyn_cast<Instruction>(*OpIt);
+       if (!OpInst)
+         continue;
+
+       // Drop this use first so the deadness query below sees the operand
+       // without it.
+       *OpIt = nullptr;
+       if (!isInstructionTriviallyDead(OpInst))
+         continue;
+
+       // Allocas are never queued here -- they are already on the worklist
+       // and will be deleted separately.
+       if (isa<AllocaInst>(OpInst))
+         continue;
+
+       DeadInsts.push_back(OpInst);
+     }
+
+     Dead->eraseFromParent();
+   }
+ }
+
+ /// isSafeForScalarRepl - Walk every use of I (the alloca Info.AI, or a
+ /// pointer derived from it at byte position Offset) and decide whether scalar
+ /// replacement can cope with it. Findings are accumulated in Info; the walk
+ /// stops at the first use that marks Info unsafe.
+ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
+                                AllocaInfo &Info) {
+   const DataLayout &DL = I->getModule()->getDataLayout();
+
+   for (Use &U : I->uses()) {
+     Instruction *UserInst = cast<Instruction>(U.getUser());
+
+     if (BitCastInst *Cast = dyn_cast<BitCastInst>(UserInst)) {
+       // A bitcast does not move the pointer; look through it.
+       isSafeForScalarRepl(Cast, Offset, Info);
+     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UserInst)) {
+       // isSafeGEP advances the offset by the GEP's indices; only descend into
+       // the GEP's own users when the GEP itself was acceptable.
+       uint64_t OffsetAfterGEP = Offset;
+       isSafeGEP(GEP, OffsetAfterGEP, Info);
+       if (!Info.isUnsafe)
+         isSafeForScalarRepl(GEP, OffsetAfterGEP, Info);
+     } else if (MemIntrinsic *MemOp = dyn_cast<MemIntrinsic>(UserInst)) {
+       // Only constant, non-negative lengths are handled.
+       ConstantInt *Len = dyn_cast<ConstantInt>(MemOp->getLength());
+       if (!Len || Len->isNegative()) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+
+       // Operand 0 of a mem intrinsic is its destination, so a use in that
+       // slot is a write.
+       const bool IsWrite = U.getOperandNo() == 0;
+       isSafeMemAccess(Offset, Len->getZExtValue(), nullptr, IsWrite, Info,
+                       MemOp, true /*AllowWholeAccess*/);
+     } else if (LoadInst *Load = dyn_cast<LoadInst>(UserInst)) {
+       if (!Load->isSimple()) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+       Type *LoadedTy = Load->getType();
+       isSafeMemAccess(Offset, DL.getTypeAllocSize(LoadedTy), LoadedTy, false,
+                       Info, Load, true /*AllowWholeAccess*/);
+       Info.hasALoadOrStore = true;
+     } else if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+       // Storing THROUGH the pointer is fine; storing the pointer value itself
+       // is not.
+       if (!Store->isSimple() || Store->getOperand(0) == I) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+       Type *StoredTy = Store->getOperand(0)->getType();
+       isSafeMemAccess(Offset, DL.getTypeAllocSize(StoredTy), StoredTy, true,
+                       Info, Store, true /*AllowWholeAccess*/);
+       Info.hasALoadOrStore = true;
+     } else if (IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(UserInst)) {
+       // Lifetime markers are the only other intrinsics tolerated.
+       const Intrinsic::ID IID = Intrin->getIntrinsicID();
+       if (IID != Intrinsic::lifetime_start && IID != Intrinsic::lifetime_end) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+     } else if (isa<PHINode>(UserInst) || isa<SelectInst>(UserInst)) {
+       isSafePHISelectUseForScalarRepl(UserInst, Offset, Info);
+     } else {
+       MarkUnsafe(Info, UserInst);
+       return;
+     }
+
+     if (Info.isUnsafe)
+       return;
+   }
+ }
+
+
+ /// isSafePHISelectUseForScalarRepl - A PHI node or select that uses a pointer
+ /// derived from the alloca need not prevent splitting the alloca. This helps
+ /// when one element of a large alloca is phi'd together somewhere: the other
+ /// elements can still be SRoA'd and promoted even if this one cannot.
+ ///
+ /// The requirement is that the users of the PHI/select never index into
+ /// other parts of the alloca. The main use case is single loads and stores
+ /// that got PHI'd together, e.g. by code sinking.
+ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
+                                            AllocaInfo &Info) {
+   // Each PHI is examined at most once.
+   PHINode *AsPHI = dyn_cast<PHINode>(I);
+   if (AsPHI && !Info.CheckedPHIs.insert(AsPHI).second)
+     return;
+
+   const DataLayout &DL = I->getModule()->getDataLayout();
+
+   for (User *U : I->users()) {
+     Instruction *UserInst = cast<Instruction>(U);
+
+     if (BitCastInst *Cast = dyn_cast<BitCastInst>(UserInst)) {
+       isSafePHISelectUseForScalarRepl(Cast, Offset, Info);
+     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UserInst)) {
+       // For simplicity only "bitcast-like" GEPs (all indices zero) are
+       // accepted. Anything more general would need a proof that the access
+       // stays inside the element being promoted.
+       if (!GEP->hasAllZeroIndices()) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+       isSafePHISelectUseForScalarRepl(GEP, Offset, Info);
+     } else if (LoadInst *Load = dyn_cast<LoadInst>(UserInst)) {
+       if (!Load->isSimple()) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+       Type *LoadedTy = Load->getType();
+       isSafeMemAccess(Offset, DL.getTypeAllocSize(LoadedTy), LoadedTy, false,
+                       Info, Load, false /*AllowWholeAccess*/);
+       Info.hasALoadOrStore = true;
+     } else if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+       // Storing THROUGH the pointer is fine; storing the pointer value itself
+       // is not.
+       if (!Store->isSimple() || Store->getOperand(0) == I) {
+         MarkUnsafe(Info, UserInst);
+         return;
+       }
+       Type *StoredTy = Store->getOperand(0)->getType();
+       isSafeMemAccess(Offset, DL.getTypeAllocSize(StoredTy), StoredTy, true,
+                       Info, Store, false /*AllowWholeAccess*/);
+       Info.hasALoadOrStore = true;
+     } else if (isa<PHINode>(UserInst) || isa<SelectInst>(UserInst)) {
+       isSafePHISelectUseForScalarRepl(UserInst, Offset, Info);
+     } else {
+       MarkUnsafe(Info, UserInst);
+       return;
+     }
+
+     if (Info.isUnsafe)
+       return;
+   }
+ }
+
+ /// isSafeGEP - Check if a GEP instruction can be handled for scalar
+ /// replacement. It is safe when every non-struct index is a constant and the
+ /// resulting offset corresponds to an element within the alloca type. The
+ /// results are flagged in the Info parameter.
+ ///
+ /// \param GEPI   the GEP being examined.
+ /// \param Offset in: byte position of GEPI's base pointer within Info.AI;
+ ///               out: that position advanced by GEPI's indices. It is left
+ ///               untouched when GEPI has no indices or is rejected for a
+ ///               non-constant index.
+ /// \param Info   receives the unsafe marking when GEPI cannot be handled.
+ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
+                      uint64_t &Offset, AllocaInfo &Info) {
+   gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+   if (GEPIt == E)
+     return;
+
+   // Walk through the GEP type indices, checking the types that this indexes
+   // into.
+   for (; GEPIt != E; ++GEPIt) {
+     // Ignore struct elements, no extra checking needed for these.
+     if ((*GEPIt)->isStructTy())
+       continue;
+
+     // Every other index has to be a compile-time constant.
+     if (!isa<ConstantInt>(GEPIt.getOperand()))
+       return MarkUnsafe(Info, GEPI);
+   }
+
+   // All indices are constant at this point, so the full index list can be
+   // folded into a byte offset. Then check that the alloca has a component
+   // element at that offset (a Size of zero skips the size check).
+   SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+   const DataLayout &DL = GEPI->getModule()->getDataLayout();
+   Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+   if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0, DL))
+     MarkUnsafe(Info, GEPI);
+ }
+
+ /// isHomogeneousAggregate - Return true when T is an array, or a struct whose
+ /// members all have the same type. On success NumElts receives the element
+ /// count and EltTy the common element type (null for an empty aggregate).
+ /// For a struct with mixed member types both outputs are still written
+ /// before false is returned; for any other type they are left untouched.
+ static bool isHomogeneousAggregate(Type *T, unsigned &NumElts,
+                                    Type *&EltTy) {
+   // Arrays are homogeneous by construction.
+   if (ArrayType *ArrTy = dyn_cast<ArrayType>(T)) {
+     NumElts = ArrTy->getNumElements();
+     EltTy = nullptr;
+     if (NumElts != 0)
+       EltTy = ArrTy->getElementType();
+     return true;
+   }
+
+   StructType *StructTy = dyn_cast<StructType>(T);
+   if (!StructTy)
+     return false;
+
+   NumElts = StructTy->getNumContainedTypes();
+   EltTy = nullptr;
+   if (NumElts != 0)
+     EltTy = StructTy->getContainedType(0);
+
+   // Every remaining member has to match the first one.
+   for (unsigned Member = 1; Member < NumElts; ++Member)
+     if (StructTy->getContainedType(Member) != EltTy)
+       return false;
+
+   return true;
+ }
+
+ /// isCompatibleAggregate - Two types are compatible when they are identical,
+ /// or when both are "homogeneous" aggregates that agree on element count and
+ /// element type.
+ static bool isCompatibleAggregate(Type *T1, Type *T2) {
+   if (T1 == T2)
+     return true;
+
+   unsigned Count1, Count2;
+   Type *Elt1, *Elt2;
+   if (!isHomogeneousAggregate(T1, Count1, Elt1))
+     return false;
+   if (!isHomogeneousAggregate(T2, Count2, Elt2))
+     return false;
+
+   return Count1 == Count2 && Elt1 == Elt2;
+ }
+
+ /// isSafeMemAccess - Decide whether a load/store/mem-intrinsic of MemSize
+ /// bytes at byte position Offset either covers the whole alloca Info.AI or
+ /// lines up with one component element inside it. The offset may stem from a
+ /// GEP on a pointer that was bitcast to a different type.
+ ///
+ /// MemOpType is the accessed value type (null for mem intrinsics), isStore
+ /// tells writes from reads, and TheAccess is the instruction blamed when the
+ /// access is rejected. With AllowWholeAccess set, an access to the entire
+ /// alloca as a unit is accepted; otherwise only accesses known to stay
+ /// within a single element are.
+ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
+                            Type *MemOpType, bool isStore,
+                            AllocaInfo &Info, Instruction *TheAccess,
+                            bool AllowWholeAccess) {
+   const DataLayout &DL = TheAccess->getModule()->getDataLayout();
+   Type *AllocTy = Info.AI->getAllocatedType();
+
+   const bool CoversWholeAlloca = Offset == 0 && AllowWholeAccess &&
+                                  MemSize == DL.getTypeAllocSize(AllocTy);
+   if (CoversWholeAlloca) {
+     // Mem intrinsics (no MemOpType) and integer loads/stores copy the bytes
+     // verbatim, padding between elements included. That is acceptable, but
+     // when one alloca ends up as both source and destination of such
+     // operations, padding has to be checked later on.
+     const bool IsRawCopy = !MemOpType || MemOpType->isIntegerTy();
+     if (IsRawCopy) {
+       if (!isStore)
+         Info.isMemCpySrc = true;
+       else
+         Info.isMemCpyDst = true;
+       return;
+     }
+
+     // An aggregate type compatible with the alloca's own type is fine too:
+     // such loads/stores can be rewritten with insertvalue/extractvalue.
+     if (isCompatibleAggregate(MemOpType, AllocTy)) {
+       Info.hasSubelementAccess = true;
+       return;
+     }
+   }
+
+   // Otherwise the offset/size pair must name a component of the alloca type.
+   if (!TypeHasComponent(AllocTy, Offset, MemSize, DL)) {
+     MarkUnsafe(Info, TheAccess);
+     return;
+   }
+   Info.hasSubelementAccess = true;
+ }
+
+ /// TypeHasComponent - Return true if T contains, at any nesting depth, a
+ /// component that starts at byte Offset and is Size bytes large. A Size of
+ /// zero means the size is not checked.
+ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
+                             const DataLayout &DL) {
+   // Descend one nesting level per iteration, re-basing Offset on the element
+   // that contains it each time.
+   for (;;) {
+     Type *InnerTy;
+     uint64_t InnerSize;
+
+     if (StructType *StructTy = dyn_cast<StructType>(T)) {
+       const StructLayout *Layout = DL.getStructLayout(StructTy);
+       unsigned Field = Layout->getElementContainingOffset(Offset);
+       InnerTy = StructTy->getContainedType(Field);
+       InnerSize = DL.getTypeAllocSize(InnerTy);
+       Offset -= Layout->getElementOffset(Field);
+     } else if (ArrayType *ArrTy = dyn_cast<ArrayType>(T)) {
+       InnerTy = ArrTy->getElementType();
+       InnerSize = DL.getTypeAllocSize(InnerTy);
+       if (Offset >= ArrTy->getNumElements() * InnerSize)
+         return false;
+       Offset %= InnerSize;
+     } else if (VectorType *VecTy = dyn_cast<VectorType>(T)) {
+       InnerTy = VecTy->getElementType();
+       InnerSize = DL.getTypeAllocSize(InnerTy);
+       if (Offset >= VecTy->getNumElements() * InnerSize)
+         return false;
+       Offset %= InnerSize;
+     } else {
+       // Scalars have no components.
+       return false;
+     }
+
+     // Exact hit on the start of an element of acceptable size.
+     if (Offset == 0 && (Size == 0 || InnerSize == Size))
+       return true;
+
+     // An access that spills past the element spans multiple elements.
+     if (Offset + Size > InnerSize)
+       return false;
+
+     T = InnerTy;
+   }
+ }
+
+ /// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
+ /// the users of instruction I, which references it, to use the separate
+ /// elements.
+ ///
+ /// \param I       AI itself or a pointer derived from it whose users are
+ ///                rewritten.
+ /// \param AI      the alloca being split.
+ /// \param Offset  the byte position within AI that I refers to.
+ /// \param NewElts the element allocas that replace AI, in element order.
+ ///
+ /// Instructions made obsolete by the rewrite are pushed on DeadInsts rather
+ /// than erased here.
+ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                                 SmallVectorImpl<AllocaInst *> &NewElts) {
+   const DataLayout &DL = I->getModule()->getDataLayout();
+   for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
+     // Advance before touching the use: rewriting may change this use.
+     Use &TheUse = *UI++;
+     Instruction *User = cast<Instruction>(TheUse.getUser());
+
+     if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+       RewriteBitCast(BC, AI, Offset, NewElts);
+       continue;
+     }
+
+     if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+       RewriteGEP(GEPI, AI, Offset, NewElts);
+       continue;
+     }
+
+     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+       // The length is dereferenced unconditionally, so it must be a constant
+       // here; cast<> makes a violation of that assumption assert instead of
+       // dereferencing a null pointer.
+       ConstantInt *Length = cast<ConstantInt>(MI->getLength());
+       uint64_t MemSize = Length->getZExtValue();
+       if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType()))
+         RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
+       // Otherwise the intrinsic can only touch a single element and the
+       // address operand will be updated, so nothing else needs to be done.
+       continue;
+     }
+
+     // Must come after the MemIntrinsic check above so that mem intrinsics
+     // are handled there.
+     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+       if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+           II->getIntrinsicID() == Intrinsic::lifetime_end) {
+         RewriteLifetimeIntrinsic(II, AI, Offset, NewElts);
+       }
+       continue;
+     }
+
+     if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+       Type *LIType = LI->getType();
+
+       if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
+         // Replace:
+         //   %res = load { i32, i32 }* %alloc
+         // with:
+         //   %load.0 = load i32* %alloc.0
+         //   %insert.0 = insertvalue { i32, i32 } undef, i32 %load.0, 0
+         //   %load.1 = load i32* %alloc.1
+         //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
+         // (Also works for arrays instead of structs)
+         Value *Insert = UndefValue::get(LIType);
+         IRBuilder<> Builder(LI);
+         for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+           Value *Load = Builder.CreateLoad(NewElts[i], "load");
+           Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
+         }
+         LI->replaceAllUsesWith(Insert);
+         DeadInsts.push_back(LI);
+       } else if (LIType->isIntegerTy() &&
+                  DL.getTypeAllocSize(LIType) ==
+                  DL.getTypeAllocSize(AI->getAllocatedType())) {
+         // If this is a load of the entire alloca to an integer, rewrite it.
+         RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
+       }
+       continue;
+     }
+
+     if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+       Value *Val = SI->getOperand(0);
+       Type *SIType = Val->getType();
+       if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
+         // Replace:
+         //   store { i32, i32 } %val, { i32, i32 }* %alloc
+         // with:
+         //   %val.0 = extractvalue { i32, i32 } %val, 0
+         //   store i32 %val.0, i32* %alloc.0
+         //   %val.1 = extractvalue { i32, i32 } %val, 1
+         //   store i32 %val.1, i32* %alloc.1
+         // (Also works for arrays instead of structs)
+         IRBuilder<> Builder(SI);
+         for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+           Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
+           Builder.CreateStore(Extract, NewElts[i]);
+         }
+         DeadInsts.push_back(SI);
+       } else if (SIType->isIntegerTy() &&
+                  DL.getTypeAllocSize(SIType) ==
+                  DL.getTypeAllocSize(AI->getAllocatedType())) {
+         // If this is a store of the entire alloca from an integer, rewrite it.
+         RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
+       }
+       continue;
+     }
+
+     if (isa<SelectInst>(User) || isa<PHINode>(User)) {
+       // If we have a PHI user of the alloca itself (as opposed to a GEP or
+       // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to
+       // the new pointer.
+       if (!isa<AllocaInst>(I)) continue;
+
+       assert(Offset == 0 && NewElts[0] &&
+              "Direct alloca use should have a zero offset");
+
+       // If we have a use of the alloca, we know the derived uses will be
+       // utilizing just the first element of the scalarized result. Insert a
+       // bitcast of the first alloca before the user as required.
+       AllocaInst *NewAI = NewElts[0];
+       BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI);
+       // The bitcast was inserted in front of NewAI; move NewAI ahead of it
+       // so the definition precedes its use.
+       NewAI->moveBefore(BCI);
+       TheUse = BCI;
+       continue;
+     }
+   }
+ }
+
+ /// RewriteBitCast - Rewrite everything hanging off bitcast BC, then, if BC
+ /// casts the alloca being replaced directly, redirect BC's uses to the
+ /// matching element alloca and queue BC for deletion.
+ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                           SmallVectorImpl<AllocaInst *> &NewElts) {
+   RewriteForScalarRepl(BC, AI, Offset, NewElts);
+
+   const bool CastsAllocaDirectly = BC->getOperand(0) == AI;
+   if (CastsAllocaDirectly) {
+     // Find the element that contains offset zero. That is normally element
+     // zero, but might not be for structs with elements of size zero.
+     Type *ContainerTy = AI->getAllocatedType();
+     uint64_t OffsetInElt = 0;
+     Type *UnusedIdxTy;
+     uint64_t EltNo = FindElementAndOffset(ContainerTy, OffsetInElt, UnusedIdxTy,
+                                           BC->getModule()->getDataLayout());
+
+     Type *WantedTy = BC->getDestTy();
+     Instruction *Replacement = NewElts[EltNo];
+     if (Replacement->getType() != WantedTy) {
+       // The element pointer still needs a cast to BC's type; let the new
+       // cast inherit BC's name.
+       Replacement = new BitCastInst(Replacement, WantedTy, "", BC);
+       Replacement->takeName(BC);
+     }
+
+     BC->replaceAllUsesWith(Replacement);
+     DeadInsts.push_back(BC);
+   }
+ }
+
+ /// FindElementAndOffset - Locate the element of T that contains byte Offset
+ /// and return its index. T must be a struct, an array or a vector. On return
+ /// T is the element's type, Offset is relative to the start of that element,
+ /// and IdxTy is the integer type to use for this index in a GEP (i32 for
+ /// struct fields, i64 otherwise).
+ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
+                                     const DataLayout &DL) {
+   // Struct fields are located through the target's struct layout.
+   if (StructType *StructTy = dyn_cast<StructType>(T)) {
+     const StructLayout *Layout = DL.getStructLayout(StructTy);
+     const unsigned FieldNo = Layout->getElementContainingOffset(Offset);
+     T = StructTy->getContainedType(FieldNo);
+     Offset -= Layout->getElementOffset(FieldNo);
+     IdxTy = Type::getInt32Ty(T->getContext());
+     return FieldNo;
+   }
+
+   // Arrays and vectors have uniformly sized elements, so plain division by
+   // the element's alloc size finds the slot.
+   if (ArrayType *ArrTy = dyn_cast<ArrayType>(T))
+     T = ArrTy->getElementType();
+   else
+     T = cast<VectorType>(T)->getElementType();
+
+   const uint64_t EltBytes = DL.getTypeAllocSize(T);
+   const uint64_t EltNo = Offset / EltBytes;
+   Offset -= EltNo * EltBytes;
+   IdxTy = Type::getInt64Ty(T->getContext());
+   return EltNo;
+ }
+
+ /// RewriteGEP - Check if this GEP instruction moves the pointer across
+ /// elements of the alloca that are being split apart, and if so, rewrite
+ /// the GEP to be relative to the new element. Offset is the byte position within AI of GEPI's base pointer; GEPI's own users are rewritten first, using the offset after GEPI's constant indices. A rewritten GEPI has its uses replaced by a pointer into NewElts and is queued on DeadInsts.
+ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+ SmallVectorImpl<AllocaInst *> &NewElts) {
+ uint64_t OldOffset = Offset; // position of the GEP's base pointer, before applying its indices
+ const DataLayout &DL = GEPI->getModule()->getDataLayout();
+ SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+ // If the GEP was dynamic then it must have been a dynamic vector lookup.
+ // In this case, it must be the last GEP operand which is dynamic so keep that
+ // aside until we've found the constant GEP offset then add it back in at the
+ // end.
+ Value* NonConstantIdx = nullptr;
+ if (!GEPI->hasAllConstantIndices())
+ NonConstantIdx = Indices.pop_back_val();
+ Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+
+ RewriteForScalarRepl(GEPI, AI, Offset, NewElts); // rewrite GEPI's users before GEPI itself
+
+ Type *T = AI->getAllocatedType();
+ Type *IdxTy;
+ uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL); // element the base pointer was in
+ if (GEPI->getOperand(0) == AI)
+ OldIdx = ~0ULL; // Force the GEP to be rewritten.
+
+ T = AI->getAllocatedType(); // FindElementAndOffset overwrote T above; start again from the alloca type
+ uint64_t EltOffset = Offset;
+ uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL); // element the GEP result is in
+
+ // If this GEP does not move the pointer across elements of the alloca
+ // being split, then it does not need to be rewritten.
+ if (Idx == OldIdx)
+ return;
+
+ Type *i32Ty = Type::getInt32Ty(AI->getContext());
+ SmallVector<Value*, 8> NewArgs;
+ NewArgs.push_back(Constant::getNullValue(i32Ty)); // leading zero index of the replacement GEP
+ while (EltOffset != 0) { // keep descending into the element until the remaining offset is consumed
+ uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
+ NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
+ }
+ if (NonConstantIdx) {
+ Type* GepTy = T;
+ // This GEP has a dynamic index. We need to add "i32 0" to index through
+ // any structs or arrays in the original type until we get to the vector
+ // to index.
+ while (!isa<VectorType>(GepTy)) {
+ NewArgs.push_back(Constant::getNullValue(i32Ty));
+ GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U);
+ }
+ NewArgs.push_back(NonConstantIdx);
+ }
+ Instruction *Val = NewElts[Idx];
+ if (NewArgs.size() > 1) { // more than the leading zero: a real GEP into the element is needed
+ Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
+ Val->takeName(GEPI);
+ }
+ if (Val->getType() != GEPI->getType())
+ Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
+ GEPI->replaceAllUsesWith(Val);
+ DeadInsts.push_back(GEPI);
+ }
+
+ /// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end marker on
+ /// (part of) AI. Replace it with equivalent markers on the element allocas
+ /// in NewElts, covering the bytes from Offset up to Offset plus the marker's
+ /// size, and queue II for deletion.
+ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                     uint64_t Offset,
+                                     SmallVectorImpl<AllocaInst *> &NewElts) {
+   ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
+
+   // Locate the element in which the marked range begins.
+   Type *AIType = AI->getAllocatedType();
+   const DataLayout &DL = II->getModule()->getDataLayout();
+   uint64_t NewOffset = Offset;
+   Type *IdxTy;
+   uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL);
+
+   IRBuilder<> Builder(II);
+   uint64_t Remaining = OldSize->getLimitedValue();
+   const bool IsStart = II->getIntrinsicID() == Intrinsic::lifetime_start;
+
+   // Hands out at most Avail bytes of the range that is still to be covered.
+   auto TakeBytes = [&Remaining](uint64_t Avail) -> uint64_t {
+     uint64_t Taken = Avail > Remaining ? Remaining : Avail;
+     Remaining -= Taken;
+     return Taken;
+   };
+
+   // Emits a marker of the same kind as II on Ptr, covering Bytes bytes.
+   auto EmitMarker = [&](Value *Ptr, uint64_t Bytes) {
+     ConstantInt *Len = Builder.getInt64(Bytes);
+     if (IsStart)
+       Builder.CreateLifetimeStart(Ptr, Len);
+     else
+       Builder.CreateLifetimeEnd(Ptr, Len);
+   };
+
+   if (NewOffset) {
+     // The range starts in the middle of an element: mark only the tail of
+     // that element, addressed as an i8 pointer NewOffset bytes in. SROA will
+     // split the alloca again later.
+     unsigned AS = AI->getType()->getAddressSpace();
+     Value *TailPtr =
+         Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS));
+     TailPtr = Builder.CreateGEP(Builder.getInt8Ty(), TailPtr,
+                                 Builder.getInt64(NewOffset));
+
+     Type *FirstEltTy = NewElts[Idx]->getAllocatedType();
+     uint64_t TailBytes = TakeBytes(DL.getTypeAllocSize(FirstEltTy) - NewOffset);
+     EmitMarker(TailPtr, TailBytes);
+     ++Idx;
+   }
+
+   // Cover the following elements until the range or the elements run out.
+   for (; Idx != NewElts.size() && Remaining; ++Idx) {
+     Type *EltTy = NewElts[Idx]->getAllocatedType();
+     uint64_t EltBytes = TakeBytes(DL.getTypeAllocSize(EltTy));
+     EmitMarker(NewElts[Idx], EltBytes);
+   }
+
+   DeadInsts.push_back(II);
+ }
+
+ /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
+ /// Rewrite it to copy or set the elements of the scalarized memory. Inst is the pointer (AI or a value derived from it) through which MI reaches the alloca, and NewElts holds the element allocas replacing AI. Scalar elements get plain loads/stores, aggregate elements get a per-element mem intrinsic, and MI is queued on DeadInsts.
+ void
+ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+ AllocaInst *AI,
+ SmallVectorImpl<AllocaInst *> &NewElts) {
+ // If this is a memcpy/memmove, construct the other pointer as the
+ // appropriate type. The "Other" pointer is the pointer that goes to memory
+ // that doesn't have anything to do with the alloca that we are promoting. For
+ // memset, this Value* stays null.
+ Value *OtherPtr = nullptr;
+ unsigned MemAlignment = MI->getAlignment();
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcpy
+ if (Inst == MTI->getRawDest())
+ OtherPtr = MTI->getRawSource();
+ else {
+ assert(Inst == MTI->getRawSource());
+ OtherPtr = MTI->getRawDest();
+ }
+ }
+
+ // If there is an other pointer, we want to convert it to the same pointer
+ // type as AI has, so we can GEP through it safely.
+ if (OtherPtr) {
+ unsigned AddrSpace =
+ cast<PointerType>(OtherPtr->getType())->getAddressSpace();
+
+ // Remove bitcasts and all-zero GEPs from OtherPtr. This is an
+ // optimization, but it's also required to detect the corner case where
+ // both pointer operands are referencing the same memory, and where
+ // OtherPtr may be a bitcast or GEP that is currently being rewritten. (This
+ // function is only called for mem intrinsics that access the whole
+ // aggregate, so non-zero GEPs are not an issue here.)
+ OtherPtr = OtherPtr->stripPointerCasts();
+
+ // Copying the alloca to itself is a no-op: just delete it.
+ if (OtherPtr == AI || OtherPtr == NewElts[0]) {
+ // This code will run twice for a no-op memcpy -- once for each operand.
+ // Put only one reference to MI on the DeadInsts list.
+ for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(),
+ E = DeadInsts.end(); I != E; ++I)
+ if (*I == MI) return;
+ DeadInsts.push_back(MI);
+ return;
+ }
+
+ // If the pointer is not the right type, insert a bitcast to the right
+ // type (the alloca's element type, in OtherPtr's own address space).
+ Type *NewTy =
+ PointerType::get(AI->getType()->getElementType(), AddrSpace);
+
+ if (OtherPtr->getType() != NewTy)
+ OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
+ }
+
+ // Process each element of the aggregate.
+ bool SROADest = MI->getRawDest() == Inst; // true when the alloca is the destination of MI
+
+ Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext()));
+ const DataLayout &DL = MI->getModule()->getDataLayout();
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // If this is a memcpy/memmove, emit a GEP of the other element address.
+ Value *OtherElt = nullptr;
+ unsigned OtherEltAlign = MemAlignment;
+
+ if (OtherPtr) {
+ Value *Idx[2] = { Zero,
+ ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
+ OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx,
+ OtherPtr->getName()+"."+Twine(i),
+ MI);
+ uint64_t EltOffset;
+ PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+ Type *OtherTy = OtherPtrTy->getElementType();
+ if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
+ EltOffset = DL.getStructLayout(ST)->getElementOffset(i);
+ } else {
+ Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
+ EltOffset = DL.getTypeAllocSize(EltTy) * i;
+ }
+
+ // The alignment of the other pointer is the guaranteed alignment of the
+ // element, which is affected by both the known alignment of the whole
+ // mem intrinsic and the alignment of the element. If the alignment of
+ // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the
+ // known alignment is just 4 bytes.
+ OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
+ }
+
+ Value *EltPtr = NewElts[i];
+ Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+
+ // If we got down to a scalar, insert a load or store as appropriate.
+ if (EltTy->isSingleValueType()) {
+ if (isa<MemTransferInst>(MI)) {
+ if (SROADest) {
+ // From Other to Alloca.
+ Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
+ new StoreInst(Elt, EltPtr, MI);
+ } else {
+ // From Alloca to Other.
+ Value *Elt = new LoadInst(EltPtr, "tmp", MI);
+ new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
+ }
+ continue;
+ }
+ assert(isa<MemSetInst>(MI));
+
+ // If the stored element is zero (common case), just store a null
+ // constant. A non-constant memset value falls through to the per-element memset below.
+ Constant *StoreVal;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) {
+ if (CI->isZero()) {
+ StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0>
+ } else {
+ // If EltTy is a vector type, get the element type.
+ Type *ValTy = EltTy->getScalarType();
+
+ // Construct an integer with the right value.
+ unsigned EltSize = DL.getTypeSizeInBits(ValTy);
+ APInt OneVal(EltSize, CI->getZExtValue());
+ APInt TotalVal(OneVal);
+ // Set each byte by repeatedly shifting left 8 bits and OR-ing the byte back in.
+ for (unsigned i = 0; 8*i < EltSize; ++i) { // NOTE: this 'i' shadows the element index of the outer loop
+ TotalVal = TotalVal.shl(8);
+ TotalVal |= OneVal;
+ }
+
+ // Convert the integer value to the appropriate type.
+ StoreVal = ConstantInt::get(CI->getContext(), TotalVal);
+ if (ValTy->isPointerTy())
+ StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
+ else if (ValTy->isFloatingPointTy())
+ StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
+ assert(StoreVal->getType() == ValTy && "Type mismatch!");
+
+ // If the requested value was a vector constant, create it.
+ if (EltTy->isVectorTy()) {
+ unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
+ StoreVal = ConstantVector::getSplat(NumElts, StoreVal);
+ }
+ }
+ new StoreInst(StoreVal, EltPtr, MI);
+ continue;
+ }
+ // Otherwise, if we're storing a byte variable, use a memset call for
+ // this element.
+ }
+
+ unsigned EltSize = DL.getTypeAllocSize(EltTy);
+ if (!EltSize) // nothing to copy or set for zero-sized elements
+ continue;
+
+ IRBuilder<> Builder(MI);
+
+ // Finally, insert the meminst for this element.
+ if (isa<MemSetInst>(MI)) {
+ Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize,
+ MI->isVolatile());
+ } else {
+ assert(isa<MemTransferInst>(MI));
+ Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr
+ Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr
+
+ if (isa<MemCpyInst>(MI))
+ Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile());
+ else
+ Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile());
+ }
+ }
+ DeadInsts.push_back(MI);
+ }
+
+ /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
+ /// overwrites the entire allocation. Extract out the pieces of the stored
+ /// integer and store them individually. SI is that store, AI the alloca being split and NewElts its element allocas; the new stores are inserted before SI, which is then queued on DeadInsts.
+ void
+ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+ SmallVectorImpl<AllocaInst *> &NewElts) {
+ // Extract each element out of the integer according to its structure offset
+ // and store the element value to the individual alloca.
+ Value *SrcVal = SI->getOperand(0);
+ Type *AllocaEltTy = AI->getAllocatedType();
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
+
+ IRBuilder<> Builder(SI);
+
+ // Handle tail padding by zero-extending the operand to the alloca's full alloc size
+ if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+ SrcVal = Builder.CreateZExt(SrcVal,
+ IntegerType::get(SI->getContext(), AllocaSizeBits));
+
+ DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI
+ << '\n');
+
+ // There are two forms here: AI could be an array or struct. Both cases
+ // have different ways to compute the element offset.
+ if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+ const StructLayout *Layout = DL.getStructLayout(EltSTy);
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // Get the number of bits to shift SrcVal to get the value.
+ Type *FieldTy = EltSTy->getElementType(i);
+ uint64_t Shift = Layout->getElementOffsetInBits(i);
+
+ if (DL.isBigEndian()) // on big-endian targets the shift is measured from the opposite end of the integer
+ Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy);
+
+ Value *EltVal = SrcVal;
+ if (Shift) {
+ Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+ EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+ }
+
+ // Truncate down to an integer of the right size.
+ uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
+
+ // Ignore zero sized fields like {}, they obviously contain no data.
+ if (FieldSizeBits == 0) continue;
+
+ if (FieldSizeBits != AllocaSizeBits)
+ EltVal = Builder.CreateTrunc(EltVal,
+ IntegerType::get(SI->getContext(), FieldSizeBits));
+ Value *DestField = NewElts[i];
+ if (EltVal->getType() == FieldTy) {
+ // Storing to an integer field of this size, just do it.
+ } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) {
+ // Bitcast to the right element type (for fp/vector values).
+ EltVal = Builder.CreateBitCast(EltVal, FieldTy);
+ } else {
+ // Otherwise, bitcast the dest pointer (for aggregates).
+ DestField = Builder.CreateBitCast(DestField,
+ PointerType::getUnqual(EltVal->getType()));
+ }
+ new StoreInst(EltVal, DestField, SI);
+ }
+
+ } else {
+ ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+ Type *ArrayEltTy = ATy->getElementType();
+ uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); // distance in bits between consecutive elements
+ uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy);
+
+ uint64_t Shift;
+
+ if (DL.isBigEndian()) // big-endian: start at the highest shift and step down by ElementOffset per element
+ Shift = AllocaSizeBits-ElementOffset;
+ else
+ Shift = 0;
+
+ for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+ // Ignore zero sized fields like {}, they obviously contain no data.
+ if (ElementSizeBits == 0) continue;
+
+ Value *EltVal = SrcVal;
+ if (Shift) {
+ Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+ EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+ }
+
+ // Truncate down to an integer of the right size.
+ if (ElementSizeBits != AllocaSizeBits)
+ EltVal = Builder.CreateTrunc(EltVal,
+ IntegerType::get(SI->getContext(),
+ ElementSizeBits));
+ Value *DestField = NewElts[i];
+ if (EltVal->getType() == ArrayEltTy) {
+ // Storing to an integer field of this size, just do it.
+ } else if (ArrayEltTy->isFloatingPointTy() ||
+ ArrayEltTy->isVectorTy()) {
+ // Bitcast to the right element type (for fp/vector values).
+ EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy);
+ } else {
+ // Otherwise, bitcast the dest pointer (for aggregates).
+ DestField = Builder.CreateBitCast(DestField,
+ PointerType::getUnqual(EltVal->getType()));
+ }
+ new StoreInst(EltVal, DestField, SI);
+
+ if (DL.isBigEndian())
+ Shift -= ElementOffset;
+ else
+ Shift += ElementOffset;
+ }
+ }
+
+ DeadInsts.push_back(SI);
+ }
+
+/// RewriteLoadUserOfWholeAlloca - LI loads the entire allocation AI as one
+/// integer. Rebuild that integer from the per-element allocas in NewElts:
+/// load each piece, widen it to the full width, shift it to its bit position
+/// and OR the pieces together. LI's uses are redirected to the rebuilt value
+/// and LI is queued on DeadInsts.
+void
+SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                   SmallVectorImpl<AllocaInst *> &NewElts) {
+  Type *AggTy = AI->getAllocatedType();
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  const uint64_t TotalBits = DL.getTypeAllocSizeInBits(AggTy);
+
+  DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
+               << '\n');
+
+  // Structs take per-field offsets from their layout; arrays use a constant
+  // stride equal to the element alloc size.
+  const StructLayout *SL = nullptr;
+  uint64_t Stride = 0;
+  if (StructType *AggSTy = dyn_cast<StructType>(AggTy))
+    SL = DL.getStructLayout(AggSTy);
+  else
+    Stride =
+        DL.getTypeAllocSizeInBits(cast<ArrayType>(AggTy)->getElementType());
+
+  // Everything accumulated below has this full-width integer type.
+  IntegerType *WideTy = IntegerType::get(LI->getContext(), TotalBits);
+  Value *Accum = Constant::getNullValue(WideTy);
+
+  for (unsigned Idx = 0, NumElts = NewElts.size(); Idx != NumElts; ++Idx) {
+    Value *Piece = NewElts[Idx];
+    Type *PieceTy = cast<PointerType>(Piece->getType())->getElementType();
+    const uint64_t PieceBits = DL.getTypeSizeInBits(PieceTy);
+
+    // Zero sized pieces such as {} carry no data.
+    if (PieceBits == 0)
+      continue;
+
+    IntegerType *PieceIntTy = IntegerType::get(LI->getContext(), PieceBits);
+
+    // Integer, fp and vector pieces are loaded as they are; anything else
+    // (an aggregate) is loaded through a pointer to a same-sized integer.
+    const bool LoadsDirectly = PieceTy->isIntegerTy() ||
+                               PieceTy->isFloatingPointTy() ||
+                               PieceTy->isVectorTy();
+    if (!LoadsDirectly)
+      Piece = new BitCastInst(Piece, PointerType::getUnqual(PieceIntTy),
+                              "", LI);
+    Piece = new LoadInst(Piece, "sroa.load.elt", LI);
+
+    // fp/vector values must become integers before they can be shifted.
+    if (Piece->getType() != PieceIntTy)
+      Piece = new BitCastInst(Piece, PieceIntTy, "", LI);
+
+    // Widen to the full alloca width so the piece can be shifted into place.
+    if (Piece->getType() != WideTy)
+      Piece = new ZExtInst(Piece, WideTy, "", LI);
+
+    // Bit position of this piece inside the whole integer.
+    uint64_t Shift = SL ? SL->getElementOffsetInBits(Idx) : Idx * Stride;
+    if (DL.isBigEndian())
+      Shift = TotalBits - Shift - PieceIntTy->getBitWidth();
+
+    if (Shift)
+      Piece = BinaryOperator::CreateShl(
+          Piece, ConstantInt::get(Piece->getType(), Shift), "", LI);
+
+    // While the accumulator is still the initial zero constant, adopt the
+    // piece directly instead of emitting an 'or x, 0'.
+    Constant *AccumC = dyn_cast<Constant>(Accum);
+    if (AccumC && AccumC->isNullValue())
+      Accum = Piece;
+    else
+      Accum = BinaryOperator::CreateOr(Piece, Accum, "", LI);
+  }
+
+  // Handle tail padding by truncating the result
+  if (DL.getTypeSizeInBits(LI->getType()) != TotalBits)
+    Accum = new TruncInst(Accum, LI->getType(), "", LI);
+
+  LI->replaceAllUsesWith(Accum);
+  DeadInsts.push_back(LI);
+}
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding in between the elements that would be split apart
+/// by SROA; return false otherwise.
+///
+/// Ty must be an array or a struct type; any other type trips the cast<>
+/// below. For an array only the element type is inspected (value size versus
+/// alloc size). For a struct every gap between one field's end and the next
+/// field's start is checked, as is the gap between the last field and the end
+/// of the struct. Bit offsets are held in uint64_t so that very large
+/// aggregates are not truncated.
+static bool HasPadding(Type *Ty, const DataLayout &DL) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Type *EltTy = ATy->getElementType();
+    return DL.getTypeSizeInBits(EltTy) != DL.getTypeAllocSizeInBits(EltTy);
+  }
+
+  // SROA currently handles only Arrays and Structs.
+  StructType *STy = cast<StructType>(Ty);
+  const StructLayout *SL = DL.getStructLayout(STy);
+  const unsigned NumFields = STy->getNumElements();
+  for (unsigned i = 0; i != NumFields; ++i) {
+    const uint64_t FieldEnd = SL->getElementOffsetInBits(i) +
+                              DL.getTypeSizeInBits(STy->getElementType(i));
+
+    // The field must run right up to the start of the next field, or to the
+    // end of the struct when it is the last one (tail padding).
+    const uint64_t NextStart = (i + 1 != NumFields)
+                                   ? SL->getElementOffsetInBits(i + 1)
+                                   : SL->getSizeInBits();
+    if (FieldEnd < NextStart)
+      return true;
+  }
+  return false;
+}
+
+/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements. Returns true if it can,
+/// false if any user is unsafe or one of the special cases below applies.
+bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
+  // Examine every user of the alloca; a single unsafe one vetoes the
+  // transformation.
+  AllocaInfo Info(AI);
+  isSafeForScalarRepl(AI, 0, Info);
+  if (Info.isUnsafe) {
+    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
+    return false;
+  }
+
+  const DataLayout &DL = AI->getModule()->getDataLayout();
+  Type *AllocTy = AI->getAllocatedType();
+
+  // All users are promotable. When the aggregate is both the source and the
+  // destination of a memcpy we have to be careful: the copy could be moving
+  // data that lives in the padding of the LLVM type but is actually used.
+  // Refuse to promote in that case.
+  const bool CopiedBothWays = Info.isMemCpySrc && Info.isMemCpyDst;
+  if (CopiedBothWays && HasPadding(AllocTy, DL))
+    return false;
+
+  // If the alloca is never accessed in part, but is accessed via loads and
+  // stores, ConvertToScalarInfo should promote it rather than splitting it
+  // up and inserting fission and fusion code. Basic SRoA is still used when
+  // the struct/array has at most one element.
+  const bool WholeAccessOnly =
+      !Info.hasSubelementAccess && Info.hasALoadOrStore;
+  if (WholeAccessOnly) {
+    uint64_t NumElts;
+    if (StructType *ST = dyn_cast<StructType>(AllocTy))
+      NumElts = ST->getNumElements();
+    else
+      NumElts = cast<ArrayType>(AllocTy)->getNumElements();
+    if (NumElts > 1)
+      return false;
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
new file mode 100644
index 0000000..054bacd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -0,0 +1,678 @@
+//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarizer"
+
+namespace {
+// Holds the scattered (per-component) form of a vector.
+using ValueVector = SmallVector<Value *, 8>;
+
+// Maps a vector Value to its scattered form. std::map is used because
+// iterators must persist across insertion and because the mapped values
+// are relatively large.
+using ScatterMap = std::map<Value *, ValueVector>;
+
+// Instructions that have been replaced with scalar implementations, each
+// paired with a pointer to its scattered form.
+using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+  // Build an empty placeholder that is only meant to be assigned over. Every
+  // field is given a defined value so that copying a placeholder (as
+  // container resizing does) never reads an indeterminate member.
+  Scatterer()
+      : BB(nullptr), V(nullptr), CachePtr(nullptr), PtrTy(nullptr), Size(0) {}
+
+  // Scatter v into components. If new instructions are needed,
+  // insert them before bbi in bb. If cachePtr is nonnull, use it to cache
+  // the results.
+  Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+            ValueVector *cachePtr = nullptr);
+
+  // Return component I, creating a new Value for it if necessary.
+  Value *operator[](unsigned I);
+
+  // Return the number of components.
+  unsigned size() const { return Size; }
+
+private:
+  BasicBlock *BB;           // Block that receives newly created instructions.
+  BasicBlock::iterator BBI; // Insertion point inside BB.
+  Value *V;                 // The value being scattered.
+  ValueVector *CachePtr;    // Optional external cache; Tmp is used if null.
+  PointerType *PtrTy;       // Type of V when V is a pointer, else null.
+  ValueVector Tmp;          // Private cache used when CachePtr is null.
+  unsigned Size;            // Number of components.
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+  FCmpInst &FCI;
+
+  FCmpSplitter(FCmpInst &Cmp) : FCI(Cmp) {}
+
+  Value *operator()(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                    const Twine &Name) const {
+    const auto Pred = FCI.getPredicate();
+    return Builder.CreateFCmp(Pred, LHS, RHS, Name);
+  }
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+  ICmpInst &ICI;
+
+  ICmpSplitter(ICmpInst &Cmp) : ICI(Cmp) {}
+
+  Value *operator()(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                    const Twine &Name) const {
+    const auto Pred = ICI.getPredicate();
+    return Builder.CreateICmp(Pred, LHS, RHS, Name);
+  }
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+  BinaryOperator &BO;
+
+  BinarySplitter(BinaryOperator &Op) : BO(Op) {}
+
+  Value *operator()(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                    const Twine &Name) const {
+    const auto Opcode = BO.getOpcode();
+    return Builder.CreateBinOp(Opcode, LHS, RHS, Name);
+  }
+};
+
+// Describes the vector accessed by a load or store that is being scalarized.
+struct VectorLayout {
+  // The type of the vector.
+  VectorType *VecTy;
+
+  // The type of each element.
+  Type *ElemTy;
+
+  // The alignment of the vector.
+  uint64_t VecAlign;
+
+  // The size of each element.
+  uint64_t ElemSize;
+
+  VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {}
+
+  // Return the alignment of element I, derived from the vector alignment and
+  // the element's offset (I * ElemSize) from the start of the vector.
+  uint64_t getElemAlign(unsigned I) {
+    const uint64_t Offset = I * ElemSize;
+    return MinAlign(VecAlign, Offset);
+  }
+};
+
+// Function pass that rewrites vector instructions as per-element scalar
+// instructions. Dispatch on the instruction kind goes through InstVisitor.
+class Scalarizer : public FunctionPass,
+                   public InstVisitor<Scalarizer, bool> {
+public:
+  static char ID;
+
+  Scalarizer() : FunctionPass(ID) {
+    initializeScalarizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  // InstVisitor methods. Each returns true if the instruction was
+  // scalarized and false if nothing changed. Instruction kinds without a
+  // dedicated handler fall through to visitInstruction.
+  bool visitInstruction(Instruction &) { return false; }
+  bool visitSelectInst(SelectInst &SI);
+  bool visitICmpInst(ICmpInst &ICI);
+  bool visitFCmpInst(FCmpInst &FCI);
+  bool visitBinaryOperator(BinaryOperator &BO);
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
+  bool visitCastInst(CastInst &CI);
+  bool visitBitCastInst(BitCastInst &BCI);
+  bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
+  bool visitPHINode(PHINode &PHI);
+  bool visitLoadInst(LoadInst &LI);
+  bool visitStoreInst(StoreInst &SI);
+
+  static void registerOptions() {
+    // Off by default: having separate loads and stores makes it more likely
+    // that the -combiner-alias-analysis limits will be reached.
+    OptionRegistry::registerOption<bool, Scalarizer,
+                                   &Scalarizer::ScalarizeLoadStore>(
+        "scalarize-load-store",
+        "Allow the scalarizer pass to scalarize loads and store", false);
+  }
+
+private:
+  Scatterer scatter(Instruction *Point, Value *V);
+  void gather(Instruction *Op, const ValueVector &CV);
+  bool canTransferMetadata(unsigned Kind);
+  void transferMetadata(Instruction *Op, const ValueVector &CV);
+  bool getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout,
+                       const DataLayout &DL);
+  bool finish();
+
+  template<typename T> bool splitBinary(Instruction &, const T &);
+
+  // Scattered forms created so far, keyed by the vector value.
+  ScatterMap Scattered;
+  // Scalarized instructions whose deletion is deferred to finish().
+  GatherList Gathered;
+  // Metadata kind ID looked up in doInitialization().
+  unsigned ParallelLoopAccessMDKind;
+  // Value of the "scalarize-load-store" option, read in doInitialization().
+  bool ScalarizeLoadStore;
+};
+
+// Storage for the static pass identifier declared in Scalarizer; it is the
+// object handed to the FunctionPass constructor.
+char Scalarizer::ID = 0;
+} // end anonymous namespace
+
+// Pass registration under the name "scalarizer".
+// NOTE(review): the macro presumably also hooks up
+// Scalarizer::registerOptions() - confirm against its definition.
+INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
+ "Scalarize vector operations", false, false)
+
+// Set up lazy scattering of v, which is a vector or a pointer to a vector.
+// No instructions are created here; the component cache (external if
+// cachePtr is given, otherwise private) is only sized to hold one empty
+// slot per component.
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+                     ValueVector *cachePtr)
+    : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+  // A pointer operand is described by the vector type it points to.
+  PtrTy = dyn_cast<PointerType>(V->getType());
+  Type *VecTy = PtrTy ? PtrTy->getElementType() : V->getType();
+  Size = VecTy->getVectorNumElements();
+
+  if (!CachePtr) {
+    Tmp.resize(Size, nullptr);
+    return;
+  }
+  if (CachePtr->empty()) {
+    CachePtr->resize(Size, nullptr);
+    return;
+  }
+  // A cache that is already populated must describe the same vector width.
+  assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary. Results are
+// remembered in the cache (external or private), so each component is
+// materialised at most once.
+Value *Scatterer::operator[](unsigned I) {
+  ValueVector &Cache = CachePtr ? *CachePtr : Tmp;
+
+  // Reuse a previously computed component when there is one.
+  if (Value *Known = Cache[I])
+    return Known;
+
+  IRBuilder<> Builder(BB, BBI);
+
+  if (PtrTy) {
+    // Pointer to vector: component 0 is the pointer recast to the element
+    // type, and every other component is a constant GEP off component 0.
+    if (!Cache[0]) {
+      Type *EltPtrTy =
+          PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+                           PtrTy->getAddressSpace());
+      Cache[0] = Builder.CreateBitCast(V, EltPtrTy, V->getName() + ".i0");
+    }
+    if (I != 0)
+      Cache[I] = Builder.CreateConstGEP1_32(nullptr, Cache[0], I,
+                                            V->getName() + ".i" + Twine(I));
+    return Cache[I];
+  }
+
+  // Walk up a chain of InsertElementInsts with constant indices looking for
+  // element I, recording the other elements met on the way. V is advanced
+  // past each insert examined; the new V is still suitable for all uncached
+  // indices.
+  while (InsertElementInst *Insert = dyn_cast<InsertElementInst>(V)) {
+    ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+    if (!Idx)
+      break;
+    const unsigned J = Idx->getZExtValue();
+    V = Insert->getOperand(0);
+    Value *Inserted = Insert->getOperand(1);
+    if (I == J) {
+      Cache[J] = Inserted;
+      return Inserted;
+    }
+    // Only the first entry found for an index we are not searching for is
+    // cached; this avoids going too far up the chain and caching incorrect
+    // entries.
+    if (!Cache[J])
+      Cache[J] = Inserted;
+  }
+
+  Cache[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+                                          V->getName() + ".i" + Twine(I));
+  return Cache[I];
+}
+
+// Cache the per-module settings used while scalarizing: the metadata kind ID
+// for "llvm.mem.parallel_loop_access" and the value of the load/store option.
+// Always returns false.
+bool Scalarizer::doInitialization(Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  ParallelLoopAccessMDKind = Ctx.getMDKindID("llvm.mem.parallel_loop_access");
+  ScalarizeLoadStore =
+      Ctx.getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
+  return false;
+}
+
+// Visit every instruction of F, then let finish() perform the deferred
+// clean-up. Returns whatever finish() reports.
+bool Scalarizer::runOnFunction(Function &F) {
+  assert(Gathered.empty() && Scattered.empty());
+  for (BasicBlock &BB : F) {
+    BasicBlock::iterator Cur = BB.begin(), End = BB.end();
+    while (Cur != End) {
+      Instruction *Inst = &*Cur;
+      const bool Scalarized = visit(Inst);
+      // Step past Inst before it can be erased, so the iterator never
+      // refers to a deleted instruction.
+      ++Cur;
+      // Scalarized instructions that produce no value are erased right
+      // away; the others are handled in finish().
+      if (Scalarized && Inst->getType()->isVoidTy())
+        Inst->eraseFromParent();
+    }
+  }
+  return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point. V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+  // Arguments are scattered at the start of the entry block, so that the
+  // components can be used everywhere.
+  if (Argument *Arg = dyn_cast<Argument>(V)) {
+    BasicBlock *Entry = &Arg->getParent()->getEntryBlock();
+    return Scatterer(Entry, Entry->begin(), V, &Scattered[V]);
+  }
+
+  // Instructions are scattered directly after themselves.
+  if (Instruction *Def = dyn_cast<Instruction>(V)) {
+    BasicBlock::iterator After(Def);
+    ++After;
+    return Scatterer(Def->getParent(), After, V, &Scattered[V]);
+  }
+
+  // Anything else is scattered just before Point, and the result is kept
+  // local to Point (no shared cache).
+  return Scatterer(Point->getParent(), Point->getIterator(), V);
+}
+
+// Replace Op with the gathered form of the components in CV. Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+//
+// Op is the vector instruction that has just been scalarized and CV holds
+// its replacement components, one per element. Op's operands are replaced
+// with undef, transferable metadata is copied onto the components, and the
+// pair is queued on Gathered for finish().
+void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+  // Since we're not deleting Op yet, stub out its operands, so that it
+  // doesn't make anything live unnecessarily.
+  for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+    Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+  transferMetadata(Op, CV);
+
+  // If we already have a scattered form of Op (created from ExtractElements
+  // of Op itself), replace them with the new form.
+  ValueVector &SV = Scattered[Op];
+  if (!SV.empty()) {
+    for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+      Value *OldV = SV[I];
+      // The scattered form is filled in lazily, so components that were
+      // never requested are still null. A component that is reused in CV
+      // must stay alive. Neither has anything to replace.
+      if (!OldV || OldV == CV[I])
+        continue;
+
+      Instruction *Old = cast<Instruction>(OldV);
+      CV[I]->takeName(Old);
+      Old->replaceAllUsesWith(CV[I]);
+      Old->eraseFromParent();
+    }
+  }
+  SV = CV;
+  Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool Scalarizer::canTransferMetadata(unsigned Tag) {
+  // The parallel-loop-access kind ID is only known at run time, so it cannot
+  // be a case label and is tested separately.
+  if (Tag == ParallelLoopAccessMDKind)
+    return true;
+
+  switch (Tag) {
+  case LLVMContext::MD_tbaa:
+  case LLVMContext::MD_fpmath:
+  case LLVMContext::MD_tbaa_struct:
+  case LLVMContext::MD_invariant_load:
+  case LLVMContext::MD_alias_scope:
+  case LLVMContext::MD_noalias:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Copy onto every instruction in CV the metadata of Op that
+// canTransferMetadata() accepts, together with Op's debug location. Entries
+// of CV that are not instructions are left alone.
+void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  Op->getAllMetadataOtherThanDebugLoc(MDs);
+
+  for (Value *Elt : CV) {
+    Instruction *New = dyn_cast<Instruction>(Elt);
+    if (!New)
+      continue;
+    for (const std::pair<unsigned, MDNode *> &MD : MDs)
+      if (canTransferMetadata(MD.first))
+        New->setMetadata(MD.first, MD.second);
+    New->setDebugLoc(Op->getDebugLoc());
+  }
+}
+
+// Try to fill in Layout from Ty, returning true on success. Alignment is
+// the alignment of the vector, or 0 if the ABI default should be used.
+// Fails (after partially filling in Layout) when Ty is not a vector or when
+// the element size in bits differs from its store size in bits.
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
+                                 VectorLayout &Layout, const DataLayout &DL) {
+  // Make sure we're dealing with a vector.
+  VectorType *VecTy = dyn_cast<VectorType>(Ty);
+  Layout.VecTy = VecTy;
+  if (!VecTy)
+    return false;
+
+  // Check that we're dealing with full-byte elements.
+  Type *ElemTy = VecTy->getElementType();
+  Layout.ElemTy = ElemTy;
+  const bool FullByteElems =
+      DL.getTypeSizeInBits(ElemTy) == DL.getTypeStoreSizeInBits(ElemTy);
+  if (!FullByteElems)
+    return false;
+
+  Layout.VecAlign = Alignment ? Alignment : DL.getABITypeAlignment(VecTy);
+  Layout.ElemSize = DL.getTypeStoreSize(ElemTy);
+  return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
+// Returns false, changing nothing, when I does not produce a vector.
+template<typename Splitter>
+bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
+  VectorType *VT = dyn_cast<VectorType>(I.getType());
+  if (!VT)
+    return false;
+
+  const unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&I);
+  Scatterer LHS = scatter(&I, I.getOperand(0));
+  Scatterer RHS = scatter(&I, I.getOperand(1));
+  assert(LHS.size() == NumElems && "Mismatched binary operation");
+  assert(RHS.size() == NumElems && "Mismatched binary operation");
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned Elem = 0; Elem != NumElems; ++Elem) {
+    const Twine EltName = I.getName() + ".i" + Twine(Elem);
+    Res[Elem] = Split(Builder, LHS[Elem], RHS[Elem], EltName);
+  }
+  gather(&I, Res);
+  return true;
+}
+
+// Scalarize a select that produces a vector. The condition may itself be a
+// vector (one condition per element) or a single value shared by all
+// elements.
+bool Scalarizer::visitSelectInst(SelectInst &SI) {
+  VectorType *VT = dyn_cast<VectorType>(SI.getType());
+  if (!VT)
+    return false;
+
+  const unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&SI);
+  Scatterer TrueVals = scatter(&SI, SI.getOperand(1));
+  Scatterer FalseVals = scatter(&SI, SI.getOperand(2));
+  assert(TrueVals.size() == NumElems && "Mismatched select");
+  assert(FalseVals.size() == NumElems && "Mismatched select");
+
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  Value *Cond = SI.getOperand(0);
+  if (!Cond->getType()->isVectorTy()) {
+    // One condition shared by every element.
+    for (unsigned I = 0; I != NumElems; ++I)
+      Res[I] = Builder.CreateSelect(Cond, TrueVals[I], FalseVals[I],
+                                    SI.getName() + ".i" + Twine(I));
+  } else {
+    // One condition per element.
+    Scatterer Conds = scatter(&SI, Cond);
+    assert(Conds.size() == NumElems && "Mismatched select");
+    for (unsigned I = 0; I != NumElems; ++I)
+      Res[I] = Builder.CreateSelect(Conds[I], TrueVals[I], FalseVals[I],
+                                    SI.getName() + ".i" + Twine(I));
+  }
+  gather(&SI, Res);
+  return true;
+}
+
+// Scalarize an integer comparison via splitBinary.
+bool Scalarizer::visitICmpInst(ICmpInst &ICI) {
+  ICmpSplitter Split(ICI);
+  return splitBinary(ICI, Split);
+}
+
+// Scalarize a floating-point comparison via splitBinary.
+bool Scalarizer::visitFCmpInst(FCmpInst &FCI) {
+  FCmpSplitter Split(FCI);
+  return splitBinary(FCI, Split);
+}
+
+// Scalarize a binary operator via splitBinary.
+bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) {
+  BinarySplitter Split(BO);
+  return splitBinary(BO, Split);
+}
+
+// Scalarize a GEP that produces a vector: emit one GEP per element, built
+// from the matching component of the base and of every index operand. The
+// inbounds flag is carried over to each new GEP instruction.
+bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+  VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+  if (!VT)
+    return false;
+
+  IRBuilder<> Builder(&GEPI);
+  const unsigned NumElems = VT->getNumElements();
+  const unsigned NumIndices = GEPI.getNumIndices();
+
+  Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+
+  // Operand 0 is the base, so index J is operand J + 1.
+  SmallVector<Scatterer, 8> IdxParts;
+  for (unsigned J = 0; J != NumIndices; ++J)
+    IdxParts.push_back(scatter(&GEPI, GEPI.getOperand(J + 1)));
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I != NumElems; ++I) {
+    SmallVector<Value *, 8> Indices;
+    for (unsigned J = 0; J != NumIndices; ++J)
+      Indices.push_back(IdxParts[J][I]);
+
+    Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
+                               GEPI.getName() + ".i" + Twine(I));
+
+    // The builder may hand back something other than a GEP instruction, so
+    // check before copying the inbounds flag.
+    GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]);
+    if (GEPI.isInBounds() && NewGEPI)
+      NewGEPI->setIsInBounds();
+  }
+  gather(&GEPI, Res);
+  return true;
+}
+
+// Scalarize a cast whose destination is a vector: apply the same cast opcode
+// to every element, targeting the destination element type.
+bool Scalarizer::visitCastInst(CastInst &CI) {
+  VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+  if (!VT)
+    return false;
+
+  const unsigned NumElems = VT->getNumElements();
+  Type *DstEltTy = VT->getElementType();
+  IRBuilder<> Builder(&CI);
+  Scatterer Src = scatter(&CI, CI.getOperand(0));
+  assert(Src.size() == NumElems && "Mismatched cast");
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I != NumElems; ++I)
+    Res[I] = Builder.CreateCast(CI.getOpcode(), Src[I], DstEltTy,
+                                CI.getName() + ".i" + Twine(I));
+  gather(&CI, Res);
+  return true;
+}
+
+// Scalarize a vector-to-vector bitcast. Returns false, changing nothing, when
+// either side is not a vector. Three shapes are handled: equal element counts
+// (element-wise bitcast), more destination elements than source elements
+// (each source element fans out), and fewer (groups of source elements are
+// packed into one destination element).
+bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
+ VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+ VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+ if (!DstVT || !SrcVT)
+ return false;
+
+ unsigned DstNumElems = DstVT->getNumElements();
+ unsigned SrcNumElems = SrcVT->getNumElements();
+ IRBuilder<> Builder(&BCI);
+ Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+ ValueVector Res;
+ Res.resize(DstNumElems);
+
+ if (DstNumElems == SrcNumElems) {
+ // Same element count: bitcast element by element.
+ for (unsigned I = 0; I < DstNumElems; ++I)
+ Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(I));
+ } else if (DstNumElems > SrcNumElems) {
+ // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
+ // individual elements to the destination.
+ unsigned FanOut = DstNumElems / SrcNumElems;
+ Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+ // ResI is the next destination slot to fill; it advances across all
+ // source elements.
+ unsigned ResI = 0;
+ for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+ Value *V = Op0[Op0I];
+ Instruction *VI;
+ // Look through any existing bitcasts before converting to <N x t2>.
+ // In the best case, the resulting conversion might be a no-op.
+ while ((VI = dyn_cast<Instruction>(V)) &&
+ VI->getOpcode() == Instruction::BitCast)
+ V = VI->getOperand(0);
+ V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+ Scatterer Mid = scatter(&BCI, V);
+ for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+ Res[ResI++] = Mid[MidI];
+ }
+ } else {
+ // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
+ unsigned FanIn = SrcNumElems / DstNumElems;
+ Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+ // Op0I is the next source element to consume; it advances across all
+ // destination elements.
+ unsigned Op0I = 0;
+ for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+ // Build the <N x t1> group with a chain of inserts, then bitcast the
+ // whole group to one destination element.
+ Value *V = UndefValue::get(MidTy);
+ for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+ V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+ BCI.getName() + ".i" + Twine(ResI)
+ + ".upto" + Twine(MidI));
+ Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(ResI));
+ }
+ }
+ gather(&BCI, Res);
+ return true;
+}
+
+// Scalarize a shuffle: each result element is taken directly from the
+// matching component of one of the two inputs, or is undef when the mask
+// value is negative. No new instructions are needed for the shuffle itself.
+bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+  if (!VT)
+    return false;
+
+  const unsigned NumElems = VT->getNumElements();
+  Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+  Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+  const unsigned NumOp0 = Op0.size();
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I != NumElems; ++I) {
+    const int Selector = SVI.getMaskValue(I);
+    if (Selector < 0) {
+      Res[I] = UndefValue::get(VT->getElementType());
+      continue;
+    }
+    // Mask values below the first operand's width select from it; larger
+    // ones select from the second operand.
+    const unsigned Sel = static_cast<unsigned>(Selector);
+    Res[I] = Sel < NumOp0 ? Op0[Sel] : Op1[Sel - NumOp0];
+  }
+  gather(&SVI, Res);
+  return true;
+}
+
+// Scalarize a vector PHI: create one scalar PHI per element, then feed each
+// of them the matching component of every incoming value.
+bool Scalarizer::visitPHINode(PHINode &PHI) {
+  VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+  if (!VT)
+    return false;
+
+  const unsigned NumElems = VT->getNumElements();
+  const unsigned NumOps = PHI.getNumOperands();
+  IRBuilder<> Builder(&PHI);
+
+  // All the scalar PHIs are created before any incoming value is added.
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned Elem = 0; Elem != NumElems; ++Elem)
+    Res[Elem] = Builder.CreatePHI(VT->getElementType(), NumOps,
+                                  PHI.getName() + ".i" + Twine(Elem));
+
+  for (unsigned In = 0; In != NumOps; ++In) {
+    Scatterer Incoming = scatter(&PHI, PHI.getIncomingValue(In));
+    BasicBlock *Pred = PHI.getIncomingBlock(In);
+    for (unsigned Elem = 0; Elem != NumElems; ++Elem)
+      cast<PHINode>(Res[Elem])->addIncoming(Incoming[Elem], Pred);
+  }
+  gather(&PHI, Res);
+  return true;
+}
+
+// Scalarize a vector load into one aligned load per element. Only done when
+// the load/store option is enabled, the load is simple, and
+// getVectorLayout() accepts the loaded type.
+bool Scalarizer::visitLoadInst(LoadInst &LI) {
+  if (!ScalarizeLoadStore || !LI.isSimple())
+    return false;
+
+  VectorLayout Layout;
+  const DataLayout &DL = LI.getModule()->getDataLayout();
+  if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout, DL))
+    return false;
+
+  const unsigned NumElems = Layout.VecTy->getNumElements();
+  IRBuilder<> Builder(&LI);
+  Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned Elem = 0; Elem != NumElems; ++Elem)
+    Res[Elem] = Builder.CreateAlignedLoad(Ptr[Elem],
+                                          Layout.getElemAlign(Elem),
+                                          LI.getName() + ".i" + Twine(Elem));
+  gather(&LI, Res);
+  return true;
+}
+
+// Scalarize a vector store into one aligned store per element. Only done
+// when the load/store option is enabled, the store is simple, and
+// getVectorLayout() accepts the stored type. A store produces no value, so
+// nothing is gathered; metadata is transferred to the new stores directly.
+bool Scalarizer::visitStoreInst(StoreInst &SI) {
+  if (!ScalarizeLoadStore || !SI.isSimple())
+    return false;
+
+  VectorLayout Layout;
+  Value *FullValue = SI.getValueOperand();
+  const DataLayout &DL = SI.getModule()->getDataLayout();
+  if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout, DL))
+    return false;
+
+  const unsigned NumElems = Layout.VecTy->getNumElements();
+  IRBuilder<> Builder(&SI);
+  Scatterer Ptr = scatter(&SI, SI.getPointerOperand());
+  Scatterer Val = scatter(&SI, FullValue);
+
+  ValueVector Stores;
+  Stores.resize(NumElems);
+  for (unsigned Elem = 0; Elem != NumElems; ++Elem) {
+    unsigned Align = Layout.getElemAlign(Elem);
+    Stores[Elem] = Builder.CreateAlignedStore(Val[Elem], Ptr[Elem], Align);
+  }
+  transferMetadata(&SI, Stores);
+  return true;
+}
+
+// Delete the instructions that we scalarized. If a full vector result
+// is still needed, recreate it using InsertElements. Returns true if the
+// function was changed.
+bool Scalarizer::finish() {
+  // Data in Gathered or Scattered indicates that the Function was changed.
+  if (Gathered.empty() && Scattered.empty())
+    return false;
+
+  for (GatherList::value_type &Entry : Gathered) {
+    Instruction *Op = Entry.first;
+    ValueVector &CV = *Entry.second;
+
+    if (!Op->use_empty()) {
+      // Something still uses the vector value, so rebuild it from its
+      // components with a chain of InsertElements.
+      Type *Ty = Op->getType();
+      IRBuilder<> Builder(Op);
+      if (isa<PHINode>(Op)) {
+        // For a PHI, build the chain at the block's first insertion point
+        // rather than at the PHI itself.
+        BasicBlock *BB = Op->getParent();
+        Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+      }
+
+      Value *Res = UndefValue::get(Ty);
+      for (unsigned I = 0, Count = Ty->getVectorNumElements(); I != Count; ++I)
+        Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+                                          Op->getName() + ".upto" + Twine(I));
+      Res->takeName(Op);
+      Op->replaceAllUsesWith(Res);
+    }
+    Op->eraseFromParent();
+  }
+
+  Gathered.clear();
+  Scattered.clear();
+  return true;
+}
+
+// Factory for the pass: returns a newly allocated Scalarizer.
+FunctionPass *llvm::createScalarizerPass() {
+ return new Scalarizer();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
new file mode 100644
index 0000000..86a10d2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -0,0 +1,1265 @@
+//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loop unrolling may create many similar GEPs for array accesses.
+// e.g., a 2-level loop
+//
+// float a[32][32]; // global variable
+//
+// for (int i = 0; i < 2; ++i) {
+// for (int j = 0; j < 2; ++j) {
+// ...
+// ... = a[x + i][y + j];
+// ...
+// }
+// }
+//
+// will probably be unrolled to:
+//
+// gep %a, 0, %x, %y; load
+// gep %a, 0, %x, %y + 1; load
+// gep %a, 0, %x + 1, %y; load
+// gep %a, 0, %x + 1, %y + 1; load
+//
+// LLVM's GVN does not use partial redundancy elimination yet, and is thus
+// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
+// significant slowdown in targets with limited addressing modes. For instance,
+// because the PTX target does not support the reg+reg addressing mode, the
+// NVPTX backend emits PTX code that literally computes the pointer address of
+// each GEP, wasting tons of registers. It emits the following PTX for the
+// first load and similar PTX for other loads.
+//
+// mov.u32 %r1, %x;
+// mov.u32 %r2, %y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6];
+//
+// To reduce the register pressure, the optimization implemented in this file
+// merges the common part of a group of GEPs, so we can compute each pointer
+// address by adding a simple offset to the common part, saving many registers.
+//
+// It works by splitting each GEP into a variadic base and a constant offset.
+// The variadic base can be computed once and reused by multiple GEPs, and the
+// constant offsets can be nicely folded into the reg+immediate addressing mode
+// (supported by most targets) without using any extra register.
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetics into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetics into the
+// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such Lowering, there are more optimization opportunities such as
+// CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+// BB1:
+// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// load %p
+// ...
+// BB2:
+// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
+// load %p2
+// ...
+//
+// We cannot apply CSE to the common part related to index "i64 %i". Lowering
+// GEPs makes that possible.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+// BB1:
+// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = add i64 %1, %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = add i64 %3, %4
+// %6 = add i64 %5, struct_field_3 ; Constant offset
+// %p = inttoptr i64 %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = add i64 %7, %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = add i64 %9, %10
+// %12 = add i64 %11, struct_field_2 ; Constant offset
+// %p2 = inttoptr i64 %12 to i32*
+// load %p2
+// ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+// BB1:
+// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = getelementptr i8* %3, i64 %4
+// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
+// %p = bitcast i8* %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = getelementptr i8* %9, i64 %10
+// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
+// %p2 = bitcast i8* %12 to i32*
+// load %p2
+// ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple
+// indices if one of the indices is variant. If we lower such a GEP into invariant
+// parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP( // Hidden flag, false by default.
+ "disable-separate-const-offset-from-gep", cl::init(false),
+ cl::desc("Do not separate the constant offset from a GEP instruction"),
+ cl::Hidden);
+// Hidden flag, false by default. Setting it may emit false positives when the
+// input module already contains dead instructions. Therefore, we set it only
+// in unit tests that are free of dead code.
+static cl::opt<bool>
+ VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
+ cl::desc("Verify this pass produces no dead code"),
+ cl::Hidden);
+
+namespace {
+
+/// \brief A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer which can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+public:
+ /// Extracts a constant offset from the given GEP index. It returns the
+ /// new index representing the remainder (equal to the original index minus
+ /// the constant offset), or nullptr if we cannot extract a constant offset.
+ /// \p Idx The given GEP index
+ /// \p GEP The given GEP; newly created instructions are inserted before it
+ /// \p UserChainTail Outputs the tail of UserChain so that we can
+ /// garbage-collect unused instructions in UserChain (nullptr on failure).
+ static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail, const DominatorTree *DT);
+ /// Looks for a constant offset from the given GEP index without extracting
+ /// it. It returns the numeric value of the extracted constant offset (0 if
+ /// failed). The meanings of the arguments are the same as for Extract.
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT);
+
+private:
+ ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
+ : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
+ }
+ /// Searches the expression that computes V for a non-zero constant C s.t.
+ /// V can be reassociated into the form V' + C. If the search is
+ /// successful, returns C and updates UserChain as a def-use chain from C to
+ /// V; otherwise, returns 0 and leaves UserChain empty.
+ ///
+ /// \p V The given expression
+ /// \p SignExtended Whether V will be sign-extended in the computation of the
+ /// GEP index
+ /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+ /// GEP index
+ /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
+ /// an index of an inbounds GEP is guaranteed to be
+ /// non-negative. Leveraging this, we can better split
+ /// inbounds GEPs.
+ APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+ /// A helper function to look into both operands of a binary operator, left first.
+ APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+ bool ZeroExtended);
+ /// After finding the constant offset C from the GEP index I, we build a new
+ /// index I' s.t. I' + C = I. This function builds and returns the new
+ /// index I' according to UserChain produced by function "find".
+ ///
+ /// The building conceptually takes two steps:
+ /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+ /// that computes I
+ /// 2) reassociate the expression tree to the form I' + C.
+ ///
+ /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+ /// sext to a, b and 5 so that we have
+ /// sext(a) + (sext(b) + 5).
+ /// Then, we reassociate it to
+ /// (sext(a) + sext(b)) + 5.
+ /// Given this form, we know I' is sext(a) + sext(b).
+ Value *rebuildWithoutConstOffset();
+ /// The first step of rebuilding the GEP index without the constant
+ /// offset: distribute s/zext to the operands of all operators in UserChain.
+ /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+ ///
+ /// The function also updates UserChain to point to new subexpressions after
+ /// distributing s/zext. e.g., the old UserChain of the above example is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// and the new UserChain is
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+ ///
+ /// \p ChainIndex The index to UserChain. ChainIndex is initially
+ /// UserChain.size() - 1, and is decremented during
+ /// the recursion.
+ Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+ /// Reassociates the GEP index to the form I' + C and returns I'.
+ Value *removeConstOffset(unsigned ChainIndex);
+ /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+ /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// returns "sext i32 (zext i16 V to i32) to i64".
+ Value *applyExts(Value *V);
+
+ /// A helper function that returns whether we can trace into the operands
+ /// of binary operator BO for a constant offset.
+ ///
+ /// \p SignExtended Whether BO is surrounded by sext
+ /// \p ZeroExtended Whether BO is surrounded by zext
+ /// \p NonNegative Whether BO is known to be non-negative, e.g., an inbounds
+ /// array index.
+ bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+ bool NonNegative);
+
+ /// The path from the constant offset to the old GEP index. e.g., if the GEP
+ /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
+ /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
+ /// UserChain[2] will be the entire expression "a * b + (c + 5)".
+ ///
+ /// This path helps to rebuild the new GEP index.
+ SmallVector<User *, 8> UserChain;
+ /// A data structure used in rebuildWithoutConstOffset. Contains all
+ /// sext/zext instructions along UserChain.
+ SmallVector<CastInst *, 16> ExtInsts;
+ Instruction *IP; /// Insertion position of cloned instructions.
+ const DataLayout &DL; // Data layout of the module containing IP.
+ const DominatorTree *DT; // Passed to haveNoCommonBitsSet in CanTraceInto.
+};
+
+/// \brief A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
+class SeparateConstOffsetFromGEP : public FunctionPass {
+public:
+ static char ID;
+ SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+ bool LowerGEP = false)
+ : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) {
+ initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false; // Caching the data layout does not modify the module.
+ }
+ bool runOnFunction(Function &F) override;
+
+private:
+ /// Tries to split the given GEP into a variadic base and a constant offset,
+ /// and returns true if the splitting succeeds.
+ bool splitGEP(GetElementPtrInst *GEP);
+ /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of GEPs with a single index and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+ /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of arithmetic operations and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+ /// Finds the constant offset within each index and accumulates them. If
+ /// LowerGEP is true, it looks at indices of both sequential and structure
+ /// types, otherwise it only looks at sequential indices. The output
+ /// NeedsExtraction indicates whether we successfully found a non-zero
+ /// constant offset.
+ int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+ /// Canonicalize array indices to pointer-size integers. This helps to
+ /// simplify the logic of splitting a GEP. For example, if a + b is a
+ /// pointer-size integer, we have
+ /// gep base, a + b = gep (gep base, a), b
+ /// However, this equality may not hold if the size of a + b is smaller than
+ /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+ /// pointer size before computing the address
+ /// (http://llvm.org/docs/LangRef.html#id181).
+ ///
+ /// This canonicalization is very likely already done in clang and
+ /// instcombine. Therefore, the program will probably remain the same.
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @i32_add in split-gep.ll
+ bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+ /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
+ /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
+ /// the constant offset. After extraction, it becomes desirable to reunite the
+ /// distributed sexts. For example,
+ ///
+ /// &a[sext(i +nsw (j +nsw 5))]
+ /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
+ /// => constant extraction &a[sext(i) + sext(j)] + 5
+ /// => reunite &a[sext(i +nsw j)] + 5
+ bool reuniteExts(Function &F);
+ /// A helper that reunites sexts in an instruction.
+ bool reuniteExts(Instruction *I);
+ /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(const SCEV *Key,
+ Instruction *Dominatee);
+ /// Verify F is free of dead code.
+ void verifyNoDeadCode(Function &F);
+
+ bool hasMoreThanOneUseInLoop(Value *v, Loop *L); // Presumably: is v used more than once inside L? Definition not shown here.
+ // Swap the index operands of two GEPs.
+ void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+ // Check if it is safe to swap the operands of two GEPs.
+ bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+ Loop *CurLoop);
+
+ const DataLayout *DL; // Set in doInitialization.
+ DominatorTree *DT;
+ ScalarEvolution *SE; // Not set by the constructor.
+ const TargetMachine *TM;
+
+ LoopInfo *LI; // Not set by the constructor.
+ TargetLibraryInfo *TLI; // Not set by the constructor.
+ /// Whether to lower a GEP with multiple indices into arithmetic operations or
+ /// multiple GEPs with a single index.
+ bool LowerGEP;
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
+};
+} // anonymous namespace
+
+char SeparateConstOffsetFromGEP::ID = 0; // Handed to the FunctionPass base constructor.
+INITIALIZE_PASS_BEGIN(
+ SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // Same five analyses as getAnalysisUsage requires.
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+
+FunctionPass * // Factory: returns a newly allocated pass built from TM and LowerGEP.
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+ bool LowerGEP) {
+ return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+}
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+ bool ZeroExtended,
+ BinaryOperator *BO,
+ bool NonNegative) {
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // expressions composed of these operations can be easily hoisted as a
+ // constant offset by reassociation.
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Or) {
+ return false;
+ }
+
+ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+ // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+ // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ if (BO->getOpcode() == Instruction::Or &&
+ !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
+ return false;
+
+ // In addition, tracing into BO requires that its surrounding s/zext (if
+ // any) is distributable to both operands.
+ //
+ // Suppose BO = A op B.
+ // SignExtended | ZeroExtended | Distributable?
+ // --------------+--------------+----------------------------------
+ // 0 | 0 | true because no s/zext exists
+ // 0 | 1 | zext(BO) == zext(A) op zext(B)
+ // 1 | 0 | sext(BO) == sext(A) op sext(B)
+ // 1 | 1 | zext(sext(BO)) ==
+ // | | zext(sext(A)) op zext(sext(B))
+ if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+ // If a + b >= 0 and (a >= 0 or b >= 0), then
+ // sext(a + b) = sext(a) + sext(b)
+ // even if the addition is not marked nsw.
+ //
+ // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
+ // index if the constant offset is non-negative.
+ //
+ // Verified in @sext_add in split-gep.ll.
+ if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+ if (!ConstLHS->isNegative())
+ return true;
+ }
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+ if (!ConstRHS->isNegative())
+ return true;
+ }
+ }
+
+ // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+ // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Sub) {
+ if (SignExtended && !BO->hasNoSignedWrap())
+ return false;
+ if (ZeroExtended && !BO->hasNoUnsignedWrap())
+ return false;
+ }
+
+ return true; // Also reached by an "or" that passed the no-common-bits check.
+}
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+ bool SignExtended,
+ bool ZeroExtended) {
+ // BO being non-negative does not shed light on whether its operands are
+ // non-negative. Clear the NonNegative flag here.
+ APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If we found a constant offset in the left operand, stop and return that.
+ // This shortcut might cause us to miss opportunities of combining the
+ // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+ // However, such cases are probably already handled by -instcombine,
+ // given this pass runs after the standard optimizations.
+ if (ConstantOffset != 0) return ConstantOffset;
+ ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If BO is a sub operator, negate the constant offset found in the right
+ // operand, because a - (b + C) == (a - b) + (-C).
+ if (BO->getOpcode() == Instruction::Sub)
+ ConstantOffset = -ConstantOffset;
+ return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+ bool ZeroExtended, bool NonNegative) {
+ // TODO(jingyue): We could trace into integer/pointer casts, such as
+ // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+ // integers because it gives good enough results for our benchmarks.
+ unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+ // We cannot do much with Values that are not a User, such as an Argument.
+ User *U = dyn_cast<User>(V);
+ if (U == nullptr) return APInt(BitWidth, 0);
+
+ APInt ConstantOffset(BitWidth, 0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // Hooray, we found it!
+ ConstantOffset = CI->getValue();
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+ // Trace into subexpressions for more hoisting opportunities.
+ if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
+ ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<SExtInst>(V)) {
+ ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+ ZeroExtended, NonNegative).sext(BitWidth);
+ } else if (isa<ZExtInst>(V)) {
+ // As an optimization, we can clear the SignExtended flag because
+ // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+ //
+ // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+ ConstantOffset =
+ find(U->getOperand(0), /* SignExtended */ false,
+ /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+ }
+
+ // If we found a non-zero constant offset, add it to the path for
+ // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+ // help this optimization. Pushing after the recursive calls return is what
+ // puts the constant at UserChain[0] and the whole index at the back.
+ if (ConstantOffset != 0)
+ UserChain.push_back(U);
+ return ConstantOffset;
+}
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+ Value *Current = V;
+ // ExtInsts is built in the use-def order (outermost ext first). Therefore,
+ // we apply them to V in the reversed order, innermost ext first.
+ for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ if (Constant *C = dyn_cast<Constant>(Current)) {
+ // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+ // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+ Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ } else {
+ Instruction *Ext = (*I)->clone(); // Clone so the original ext is left untouched.
+ Ext->setOperand(0, Current);
+ Ext->insertBefore(IP);
+ Current = Ext;
+ }
+ }
+ return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+ distributeExtsAndCloneChain(UserChain.size() - 1); // Start from the whole index at the back.
+ // Compact UserChain in place, dropping the nullptrs that
+ // distributeExtsAndCloneChain left where the s/zext entries used to be.
+ unsigned NewSize = 0;
+ for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
+ if (*I != nullptr) {
+ UserChain[NewSize] = *I;
+ NewSize++;
+ }
+ }
+ UserChain.resize(NewSize);
+ return removeConstOffset(UserChain.size() - 1);
+}
+
+Value * // Returns the new value stored at UserChain[ChainIndex].
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ User *U = UserChain[ChainIndex];
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(U));
+ // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+ assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+ "We only traced into two types of CastInst: sext and zext");
+ ExtInsts.push_back(Cast);
+ UserChain[ChainIndex] = nullptr; // Removed later by rebuildWithoutConstOffset.
+ return distributeExtsAndCloneChain(ChainIndex - 1);
+ }
+
+ // Function find only traces into BinaryOperator and CastInst.
+ BinaryOperator *BO = cast<BinaryOperator>(U);
+ // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+ BinaryOperator *NewBO = nullptr; // Clone of BO with operand order preserved.
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+ BO->getName(), IP);
+ } else {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+ BO->getName(), IP);
+ }
+ return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+ if (ChainIndex == 0) { // Base case: the constant itself is replaced by zero.
+ assert(isa<ConstantInt>(UserChain[ChainIndex]));
+ return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+ }
+
+ BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+ assert(BO->getNumUses() <= 1 &&
+ "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "UserChain, so no one should be used more than "
+ "once");
+
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+ Value *NextInChain = removeConstOffset(ChainIndex - 1);
+ Value *TheOther = BO->getOperand(1 - OpNo);
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
+
+ BinaryOperator::BinaryOps NewOp = BO->getOpcode();
+ if (BO->getOpcode() == Instruction::Or) {
+ // Rebuild "or" as "add", because "or" may be invalid for the new
+ // expression.
+ //
+ // For instance, given
+ // a | (b + 5) where a and b + 5 have no common bits,
+ // we can extract 5 as the constant offset.
+ //
+ // However, reusing the "or" in the new index would give us
+ // (a | b) + 5
+ // which does not equal a | (b + 5).
+ //
+ // Replacing the "or" with "add" is fine, because
+ // a | (b + 5) = a + (b + 5) = (a + b) + 5
+ NewOp = Instruction::Add;
+ }
+
+ BinaryOperator *NewBO; // Rebuilt operator with operand order preserved.
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
+ } else {
+ NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
+ }
+ NewBO->takeName(BO);
+ return NewBO;
+}
+
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail,
+ const DominatorTree *DT) {
+ ConstantOffsetExtractor Extractor(GEP, DT); // GEP doubles as the insertion point.
+ // Find a non-zero constant offset first. The index is treated as
+ // non-negative exactly when the GEP is inbounds.
+ APInt ConstantOffset =
+ Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds());
+ if (ConstantOffset == 0) { // Nothing to extract: report failure through both outputs.
+ UserChainTail = nullptr;
+ return nullptr;
+ }
+ // Separates the constant offset from the GEP index.
+ Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
+ UserChainTail = Extractor.UserChain.back();
+ return IdxWithoutConstOffset;
+}
+
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT) {
+ // If Idx is an index of an inbounds GEP, Idx is guaranteed to be non-negative.
+ return ConstantOffsetExtractor(GEP, DT) // Temporary extractor: only searches, never rebuilds.
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
+}
+
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+ GetElementPtrInst *GEP) {
+ bool Changed = false;
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); // Operand 0 is the base pointer.
+ I != E; ++I, ++GTI) {
+ // Skip struct member indices which must be i32.
+ if (isa<SequentialType>(*GTI)) {
+ if ((*I)->getType() != IntPtrTy) {
+ *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP); // true = signed cast, inserted before GEP.
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+int64_t // Returns the summed constant byte offset; see NeedsExtraction below.
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+ bool &NeedsExtraction) {
+ NeedsExtraction = false;
+ int64_t AccumulativeByteOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ // Tries to find a constant offset in this GEP index (nothing is rewritten).
+ int64_t ConstantOffset =
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ if (ConstantOffset != 0) {
+ NeedsExtraction = true;
+ // A GEP may have multiple indices. We accumulate the extracted
+ // constant offset to a byte offset, and later offset the remainder of
+ // the original GEP with this byte offset.
+ AccumulativeByteOffset +=
+ ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); // Scale by the indexed element's alloc size.
+ }
+ } else if (LowerGEP) { // Struct indices only count when lowering GEPs.
+ StructType *StTy = cast<StructType>(*GTI);
+ uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+ // Skip field 0 as the offset is always 0.
+ if (Field != 0) {
+ NeedsExtraction = true;
+ AccumulativeByteOffset +=
+ DL->getStructLayout(StTy)->getElementOffset(Field);
+ }
+ }
+ }
+ return AccumulativeByteOffset;
+}
+
+// Lowers Variadic into a chain of single-index i8 GEPs ("uglygeps"): one per
+// non-zero sequential index, followed by one for AccumulativeByteOffset when
+// that offset is non-zero. All uses of Variadic are redirected to the end of
+// the chain and Variadic is erased.
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+  IRBuilder<> Builder(Variadic);
+  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+  Type *I8PtrTy =
+      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+  Value *ResultPtr = Variadic->getOperand(0);
+  Loop *L = LI->getLoopFor(Variadic->getParent());
+  // Swapping is only considered when the GEP is inside a loop, its base is
+  // loop invariant, and the base has at most one use inside that loop.
+  bool isSwapCandidate =
+      L && L->isLoopInvariant(ResultPtr) &&
+      !hasMoreThanOneUseInLoop(ResultPtr, L);
+  // First uglygep created below; stays null if every sequential index is
+  // skipped as a constant zero.
+  Value *FirstResult = nullptr;
+
+  if (ResultPtr->getType() != I8PtrTy)
+    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+  gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create an ugly GEP for each sequential index. We don't create GEPs for
+  // structure indices, as they are accumulated in the constant offset index.
+  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *Idx = Variadic->getOperand(I);
+      // Skip zero indices.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+        if (CI->isZero())
+          continue;
+
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      // Scale the index by element size.
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+        } else {
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+        }
+      }
+      // Create an ugly GEP with a single index for each index.
+      ResultPtr =
+          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+      if (FirstResult == nullptr)
+        FirstResult = ResultPtr;
+    }
+  }
+
+  // Create a GEP with the constant offset index.
+  if (AccumulativeByteOffset != 0) {
+    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+    ResultPtr =
+        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
+  } else
+    isSwapCandidate = false;
+
+  // If we created a GEP with constant index, and the base is loop invariant,
+  // then we swap the first one with it, so LICM can move constant GEP out
+  // later.
+  // FirstResult may still be null here, and dyn_cast<> must not be handed a
+  // null pointer; dyn_cast_or_null<> yields a null FirstGEP instead, which
+  // isLegalToSwapOperand rejects.
+  GetElementPtrInst *FirstGEP =
+      dyn_cast_or_null<GetElementPtrInst>(FirstResult);
+  GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
+  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+    swapGEPOperand(FirstGEP, SecondGEP);
+
+  if (ResultPtr->getType() != Variadic->getType())
+    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+  Variadic->replaceAllUsesWith(ResultPtr);
+  Variadic->eraseFromParent();
+}
+
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+                                               int64_t AccumulativeByteOffset) {
+  // Rewrites Variadic as integer arithmetic: ptrtoint of the base, one
+  // scaled ADD per non-zero sequential index, an ADD for the constant byte
+  // offset, and a final inttoptr that replaces Variadic.
+  IRBuilder<> Builder(Variadic);
+  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+  Value *Addr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+  gep_type_iterator TypeIt = gep_type_begin(*Variadic);
+  const unsigned NumOps = Variadic->getNumOperands();
+  for (unsigned OpNo = 1; OpNo != NumOps; ++OpNo, ++TypeIt) {
+    // Structure indices are already part of AccumulativeByteOffset, so only
+    // sequential indices produce arithmetic here.
+    if (!isa<SequentialType>(*TypeIt))
+      continue;
+
+    Value *Term = Variadic->getOperand(OpNo);
+    // A constant zero index adds nothing.
+    if (ConstantInt *ZeroCheck = dyn_cast<ConstantInt>(Term))
+      if (ZeroCheck->isZero())
+        continue;
+
+    APInt ElemBytes(IntPtrTy->getIntegerBitWidth(),
+                    DL->getTypeAllocSize(TypeIt.getIndexedType()));
+    // Scale the index by the element size: a shift for powers of two, a
+    // multiply otherwise, and nothing at all for size 1.
+    if (ElemBytes != 1) {
+      if (ElemBytes.isPowerOf2())
+        Term = Builder.CreateShl(
+            Term, ConstantInt::get(IntPtrTy, ElemBytes.logBase2()));
+      else
+        Term = Builder.CreateMul(Term, ConstantInt::get(IntPtrTy, ElemBytes));
+    }
+    Addr = Builder.CreateAdd(Addr, Term);
+  }
+
+  // Fold in the constant offset last.
+  if (AccumulativeByteOffset != 0)
+    Addr = Builder.CreateAdd(
+        Addr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+
+  Value *Lowered = Builder.CreateIntToPtr(Addr, Variadic->getType());
+  Variadic->replaceAllUsesWith(Lowered);
+  Variadic->eraseFromParent();
+}
+
+// Tries to split the constant part of GEP's indices into a separate offset.
+// Returns true if the IR was changed (including the case where only the index
+// types were canonicalized), false otherwise. GEP may be erased and replaced
+// by new instructions when this returns true.
+bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
+ // Skip vector GEPs.
+ if (GEP->getType()->isVectorTy())
+ return false;
+
+ // The backend can already nicely handle the case where all indices are
+ // constant.
+ if (GEP->hasAllConstantIndices())
+ return false;
+
+ bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+ bool NeedsExtraction;
+ int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+ if (!NeedsExtraction)
+ return Changed;
+ // If LowerGEP is disabled, before really splitting the GEP, check whether the
+ // backend supports the addressing mode we are about to produce. If no, this
+ // splitting probably won't be beneficial.
+ // If LowerGEP is enabled, even if the extracted constant offset cannot match
+ // the addressing mode, we can still do optimizations to other lowered parts
+ // of variable indices. Therefore, we don't check for addressing modes in that
+ // case.
+ if (!LowerGEP) {
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *GEP->getParent()->getParent());
+ unsigned AddrSpace = GEP->getPointerAddressSpace();
+ if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+ /*BaseGV=*/nullptr, AccumulativeByteOffset,
+ /*HasBaseReg=*/true, /*Scale=*/0,
+ AddrSpace)) {
+ return Changed;
+ }
+ }
+
+ // Remove the constant offset in each sequential index. The resultant GEP
+ // computes the variadic base.
+ // Notice that we don't remove struct field indices here. If LowerGEP is
+ // disabled, a structure index is not accumulated and we still use the old
+ // one. If LowerGEP is enabled, a structure index is accumulated in the
+ // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
+ // handle the constant offset and won't need a new structure index.
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ // Splits this GEP index into a variadic part and a constant offset, and
+ // uses the variadic part as the new index.
+ Value *OldIdx = GEP->getOperand(I);
+ User *UserChainTail;
+ Value *NewIdx =
+ ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+ if (NewIdx != nullptr) {
+ // Switches to the index with the constant offset removed.
+ GEP->setOperand(I, NewIdx);
+ // After switching to the new index, we can garbage-collect UserChain
+ // and the old index if they are not used.
+ RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+ RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+ }
+ }
+ }
+
+ // Clear the inbounds attribute because the new index may be off-bound.
+ // e.g.,
+ //
+ // b = add i64 a, 5
+ // addr = gep inbounds float, float* p, i64 b
+ //
+ // is transformed to:
+ //
+ // addr2 = gep float, float* p, i64 a ; inbounds removed
+ // addr = gep inbounds float, float* addr2, i64 5
+ //
+ // If a is -4, although the old index b is in bounds, the new index a is
+ // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+ // inbounds keyword is not present, the offsets are added to the base
+ // address with silently-wrapping two's complement arithmetic".
+ // Therefore, the final code will be semantically equivalent.
+ //
+ // TODO(jingyue): do some range analysis to keep as many inbounds as
+ // possible. GEPs with inbounds are more friendly to alias analysis.
+ bool GEPWasInBounds = GEP->isInBounds();
+ GEP->setIsInBounds(false);
+
+ // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ if (LowerGEP) {
+ // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+ // arithmetic operations if the target uses alias analysis in codegen.
+ if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+ else
+ lowerToArithmetics(GEP, AccumulativeByteOffset);
+ return true;
+ }
+
+ // No need to create another GEP if the accumulative byte offset is 0.
+ if (AccumulativeByteOffset == 0)
+ return true;
+
+ // Offsets the base with the accumulative byte offset.
+ //
+ // %gep ; the base
+ // ... %gep ...
+ //
+ // => add the offset
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // %gep ; will be removed
+ // ... %gep ...
+ //
+ // => replace all uses of %gep with %new.gep and remove %gep
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // ... %new.gep ...
+ //
+ // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+ // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+ // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+ // type of %gep.
+ //
+ // %gep2 ; clone of %gep
+ // %0 = bitcast %gep2 to i8*
+ // %uglygep = gep %0, <offset>
+ // %new.gep = bitcast %uglygep to <type of %gep>
+ // ... %new.gep ...
+ Instruction *NewGEP = GEP->clone();
+ NewGEP->insertBefore(GEP);
+
+ // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
+ // unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
+ // used with signed integers (AccumulativeByteOffset) later.
+ int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+ DL->getTypeAllocSize(GEP->getType()->getElementType()));
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
+ // extracted should be a multiple of sizeof(*%gep).
+ int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ } else {
+ // Unlikely but possible. For example,
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+ // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+ // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+ // sizeof(int64).
+ //
+ // Emit an uglygep in this case.
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+ GEP->getPointerAddressSpace());
+ NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+ NewGEP = GetElementPtrInst::Create(
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+ }
+
+ GEP->replaceAllUsesWith(NewGEP);
+ GEP->eraseFromParent();
+
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+  // Nothing to do for optnone functions or when the pass is switched off.
+  if (skipOptnoneFunction(F) || DisableSeparateConstOffsetFromGEP)
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  bool Modified = false;
+  for (BasicBlock &BB : F) {
+    BasicBlock::iterator Pos = BB.begin(), End = BB.end();
+    while (Pos != End) {
+      // Step past the instruction first: splitGEP may erase it.
+      Instruction *Cur = &*Pos++;
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Cur))
+        Modified |= splitGEP(GEP);
+    }
+    // GEP ConstantExprs are not split: all of their indices are constant
+    // already.
+  }
+
+  Modified |= reuniteExts(F);
+
+  if (VerifyNoDeadCode)
+    verifyNoDeadCode(F);
+
+  return Modified;
+}
+
+Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
+    const SCEV *Key, Instruction *Dominatee) {
+  // Returns the most recently recorded instruction for Key that dominates
+  // Dominatee, or nullptr if there is none.
+  auto Entry = DominatingExprs.find(Key);
+  if (Entry == DominatingExprs.end())
+    return nullptr;
+
+  // Basic blocks are visited in pre-order of the dominator tree, so a
+  // candidate that fails to dominate the current instruction cannot dominate
+  // any later one either. Such candidates are popped for good, which keeps
+  // the overall algorithm O(n).
+  auto &Stack = Entry->second;
+  for (; !Stack.empty(); Stack.pop_back()) {
+    Instruction *Top = Stack.back();
+    if (DT->dominates(Top, Dominatee))
+      return Top;
+  }
+  return nullptr;
+}
+
+// Tries to rewrite I = sext(LHS) op sext(RHS) (op being add or sub) into
+// sext(Dom), where Dom is a dominating "LHS op RHS" that cannot sign
+// overflow. Returns true if I was replaced. Otherwise, if I itself is an nsw
+// add/sub that is known not to be full poison, records it as a candidate
+// dominator for later instructions and returns false.
+bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
+  if (!SE->isSCEVable(I->getType()))
+    return false;
+
+  // Adds and subs must never share a key: replacing sext(a) - sext(b) with
+  // sext(a + b) (or the reverse) would be a miscompile. Adds are keyed by
+  // LHS + RHS, subs by LHS - RHS.
+  auto MakeKey = [this](Value *LHS, Value *RHS, bool IsAdd) -> const SCEV * {
+    const SCEV *L = SE->getUnknown(LHS);
+    const SCEV *R = SE->getUnknown(RHS);
+    return IsAdd ? SE->getAddExpr(L, R) : SE->getMinusSCEV(L, R);
+  };
+
+  // Dom: LHS+RHS
+  // I: sext(LHS)+sext(RHS)
+  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
+  // The same holds with '-' in place of '+'.
+  // TODO: handle zext
+  Value *LHS = nullptr, *RHS = nullptr;
+  bool IsSExtAdd =
+      match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))));
+  bool IsSExtSub =
+      !IsSExtAdd &&
+      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))));
+  if ((IsSExtAdd || IsSExtSub) && LHS->getType() == RHS->getType()) {
+    const SCEV *Key = MakeKey(LHS, RHS, IsSExtAdd);
+    if (auto *Dom = findClosestMatchingDominator(Key, I)) {
+      Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+      NewSExt->takeName(I);
+      I->replaceAllUsesWith(NewSExt);
+      RecursivelyDeleteTriviallyDeadInstructions(I);
+      return true;
+    }
+  }
+
+  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
+  bool IsNSWAdd = match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)));
+  bool IsNSWSub = !IsNSWAdd && match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)));
+  if ((IsNSWAdd || IsNSWSub) && isKnownNotFullPoison(I)) {
+    const SCEV *Key = MakeKey(LHS, RHS, IsNSWAdd);
+    DominatingExprs[Key].push_back(I);
+  }
+  return false;
+}
+
+bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
+  // Runs the per-instruction reuniteExts over every block, visiting blocks
+  // in the order produced by iterating the dominator tree's GraphTraits
+  // nodes. The candidate table is rebuilt from scratch for each function.
+  DominatingExprs.clear();
+  bool AnyRewritten = false;
+  for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {
+    BasicBlock *BB = Node->getBlock();
+    auto Pos = BB->begin();
+    while (Pos != BB->end()) {
+      // Advance before the call: the instruction may be deleted by it.
+      Instruction *Cur = &*Pos;
+      ++Pos;
+      if (reuniteExts(Cur))
+        AnyRewritten = true;
+    }
+  }
+  return AnyRewritten;
+}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+  // Aborts via llvm_unreachable on the first trivially dead instruction
+  // found in F, printing that instruction in the message.
+  for (BasicBlock &Block : F) {
+    for (Instruction &Inst : Block) {
+      if (!isInstructionTriviallyDead(&Inst))
+        continue;
+      std::string Message;
+      raw_string_ostream Stream(Message);
+      Stream << "Dead instruction detected!\n" << Inst << "\n";
+      llvm_unreachable(Stream.str().c_str());
+    }
+  }
+}
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+  // Both GEPs must exist, be different instructions in the same block, and
+  // the first one must have exactly one use.
+  if (!FirstGEP || !FirstGEP->hasOneUse() || !SecondGEP ||
+      FirstGEP->getParent() != SecondGEP->getParent() ||
+      FirstGEP == SecondGEP)
+    return false;
+
+  // Only plain "base + one index" GEPs are handled.
+  if (FirstGEP->getNumOperands() != 2 || SecondGEP->getNumOperands() != 2)
+    return false;
+
+  // Give up if the index of the first GEP is loop invariant.
+  Value *Idx = FirstGEP->getOperand(1);
+  if (CurLoop->isLoopInvariant(Idx))
+    return false;
+
+  // Give up if the two bases don't have the same type.
+  if (FirstGEP->getOperand(0)->getType() != SecondGEP->getOperand(0)->getType())
+    return false;
+
+  // Check whether the index of the first GEP has a constant coefficient.
+  // For example, in the following code nothing is gained by hoisting the
+  // second GEP out, because the second GEP can be folded away:
+  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+  //   %67 = shl i64 %scevgep.sum.ur159, 2
+  //   %uglygep160 = getelementptr i8* %65, i64 %67
+  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+  Instruction *IdxDef = dyn_cast<Instruction>(Idx);
+
+  // Look through a shift by a constant, which splitting GEPs may have
+  // generated.
+  if (IdxDef && IdxDef->isShift() && isa<ConstantInt>(IdxDef->getOperand(1)))
+    IdxDef = dyn_cast<Instruction>(IdxDef->getOperand(0));
+
+  BinaryOperator *BinOp = IdxDef ? dyn_cast<BinaryOperator>(IdxDef) : nullptr;
+  if (!BinOp)
+    return true;
+
+  // An Add or Sub with a constant operand may not be profitable at all,
+  // because of constant folding.
+  const unsigned Opcode = BinOp->getOpcode();
+  const bool IsAddOrSub =
+      Opcode == Instruction::Add || Opcode == Instruction::Sub;
+  const bool HasConstantOperand = isa<ConstantInt>(BinOp->getOperand(0)) ||
+                                  isa<ConstantInt>(BinOp->getOperand(1));
+  return !(IsAddOrSub && HasConstantOperand);
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+  // True as soon as a second instruction user of V inside L is seen.
+  bool SeenOne = false;
+  for (User *U : V->users()) {
+    Instruction *UserInst = dyn_cast<Instruction>(U);
+    if (!UserInst || !L->contains(UserInst))
+      continue;
+    if (SeenOne)
+      return true;
+    SeenOne = true;
+  }
+  return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+                                                GetElementPtrInst *Second) {
+  // Exchange the index operands of the two GEPs.
+  Value *FirstIdx = First->getOperand(1);
+  Value *SecondIdx = Second->getOperand(1);
+  First->setOperand(1, SecondIdx);
+  Second->setOperand(1, FirstIdx);
+
+  // p+o+c has become p+c+o, and p+c may not be inbounds anymore. Recompute
+  // the inbounds flags from the accumulated constant offset of First and the
+  // size of the object it is based on.
+  const DataLayout &Layout = First->getModule()->getDataLayout();
+  const unsigned AddrSpace =
+      cast<PointerType>(First->getType())->getAddressSpace();
+  APInt ConstOffset(Layout.getPointerSizeInBits(AddrSpace), 0);
+  Value *Underlying =
+      First->stripAndAccumulateInBoundsConstantOffsets(Layout, ConstOffset);
+
+  uint64_t ObjectBytes;
+  const bool KnownInBounds =
+      getObjectSize(Underlying, ObjectBytes, Layout, TLI) &&
+      !ConstOffset.ugt(ObjectBytes);
+  if (KnownInBounds) {
+    First->setIsInBounds(true);
+  } else {
+    First->setIsInBounds(false);
+    Second->setIsInBounds(false);
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 0000000..63c8836
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,239 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations. For example:
+//
+// * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the
+// predecessor only has one successor.
+// * Eliminates PHI nodes for basic blocks with a single predecessor.
+// * Eliminates a basic block that only contains an unconditional branch.
+// * Changes invoke instructions to nounwind functions to be calls.
+// * Change things like "if (x) if (y)" into "if (x&y)".
+// * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "simplifycfg"
+
+// Hidden command-line option "bonus-inst-threshold" (default 1). Its value is
+// used as the bonus-instruction threshold when a pass instance is not given
+// an explicit one.
+static cl::opt<unsigned>
+UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
+// Incremented once for every SimplifyCFG call that reports a change.
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+/// If we have more than one empty (other than phi node) return blocks,
+/// merge them together to promote recursive block merging.
+///
+/// The first qualifying return block found becomes the canonical one; every
+/// later one is either erased (same or no return value) or turned into an
+/// unconditional branch to the canonical block, with the differing return
+/// values merged through a PHI node there.
+/// \returns true if any block was merged.
+static bool mergeEmptyReturnBlocks(Function &F) {
+ bool Changed = false;
+
+ // Canonical return block that later duplicates are merged into.
+ BasicBlock *RetBlock = nullptr;
+
+ // Scan all the blocks in the function, looking for empty return blocks.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+ // Advance the iterator up front: BB may be erased below.
+ BasicBlock &BB = *BBI++;
+
+ // Only look at return blocks.
+ ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!Ret) continue;
+
+ // Only look at the block if it is empty or the only other thing in it is a
+ // single PHI node that is the operand to the return.
+ if (Ret != &BB.front()) {
+ // Check for something else in the block.
+ BasicBlock::iterator I(Ret);
+ --I;
+ // Skip over debug info.
+ while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+ --I;
+ if (!isa<DbgInfoIntrinsic>(I) &&
+ (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) != &*I))
+ continue;
+ }
+
+ // If this is the first returning block, remember it and keep going.
+ if (!RetBlock) {
+ RetBlock = &BB;
+ continue;
+ }
+
+ // Otherwise, we found a duplicate return block. Merge the two.
+ Changed = true;
+
+ // Case when there is no input to the return or when the returned values
+ // agree is trivial. Note that they can't agree if there are phis in the
+ // blocks.
+ if (Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) ==
+ cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+ BB.replaceAllUsesWith(RetBlock);
+ BB.eraseFromParent();
+ continue;
+ }
+
+ // If the canonical return block has no PHI node, create one now.
+ PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+ if (!RetBlockPHI) {
+ Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+ pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+ RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+ std::distance(PB, PE), "merge",
+ &RetBlock->front());
+
+ // Every existing predecessor keeps returning the old value.
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ RetBlockPHI->addIncoming(InVal, *PI);
+ RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+ }
+
+ // Turn BB into a block that just unconditionally branches to the return
+ // block. This handles the case when the two return blocks have a common
+ // predecessor but that return different things.
+ RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+ BB.getTerminator()->eraseFromParent();
+ BranchInst::Create(RetBlock, &BB);
+ }
+
+ return Changed;
+}
+
+/// Repeatedly sweeps SimplifyCFG over every block of \p F until a full sweep
+/// makes no change.
+/// \returns true if any sweep changed the function.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+                                   AssumptionCache *AC,
+                                   unsigned BonusInstThreshold) {
+  bool EverChanged = false;
+  bool SweepChanged;
+  do {
+    SweepChanged = false;
+
+    // Visit every basic block; SimplifyCFG may remove the one it is given,
+    // so the iterator is advanced before the call.
+    Function::iterator Pos = F.begin();
+    while (Pos != F.end()) {
+      BasicBlock *Current = &*Pos++;
+      if (!SimplifyCFG(Current, TTI, BonusInstThreshold, AC))
+        continue;
+      SweepChanged = true;
+      ++NumSimpl;
+    }
+    EverChanged |= SweepChanged;
+  } while (SweepChanged);
+  return EverChanged;
+}
+
+/// Runs the full CFG clean-up on \p F: unreachable-block removal, merging of
+/// empty return blocks, and iterated SimplifyCFG.
+/// \returns true if the function was changed.
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+                                AssumptionCache *AC, int BonusInstThreshold) {
+  bool Changed = removeUnreachableBlocks(F);
+  Changed |= mergeEmptyReturnBlocks(F);
+  Changed |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+
+  // None of the three steps did anything: done.
+  if (!Changed)
+    return false;
+
+  // iterativelySimplifyCFG can (rarely) make some loops dead, and only
+  // removeUnreachableBlocks can nuke those, so the two have to be iterated.
+  // Checking removeUnreachableBlocks once up front avoids rerunning
+  // iterativelySimplifyCFG when that second removal pass finds nothing.
+  if (removeUnreachableBlocks(F)) {
+    bool Again;
+    do {
+      Again = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+      Again |= removeUnreachableBlocks(F);
+    } while (Again);
+  }
+
+  return true;
+}
+
+// Default construction takes the threshold from the "bonus-inst-threshold"
+// command-line option.
+SimplifyCFGPass::SimplifyCFGPass()
+ : BonusInstThreshold(UserBonusInstThreshold) {}
+
+// Constructs the pass with an explicit bonus-instruction threshold.
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
+ : BonusInstThreshold(BonusInstThreshold) {}
+
+// Runs CFG simplification on F and reports which analyses survive.
+// simplifyFunctionCFG returns true when it modified the function.
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+                                       AnalysisManager<Function> *AM) {
+  auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  // Nothing changed: every analysis is still valid.
+  if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+    return PreservedAnalyses::all();
+
+  // The CFG was rewritten, so no analysis can be assumed to be preserved.
+  return PreservedAnalyses::none();
+}
+
+namespace {
+/// FunctionPass that runs simplifyFunctionCFG, optionally filtered by a
+/// caller-supplied predicate on the function.
+struct CFGSimplifyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  unsigned BonusInstThreshold;
+  std::function<bool(const Function &)> PredicateFtor;
+
+  /// \p T of -1 means "use the bonus-inst-threshold command-line option";
+  /// any other value is used as the threshold directly.
+  CFGSimplifyPass(int T = -1,
+                  std::function<bool(const Function &)> Ftor = nullptr)
+      : FunctionPass(ID), PredicateFtor(Ftor) {
+    if (T == -1)
+      BonusInstThreshold = UserBonusInstThreshold;
+    else
+      BonusInstThreshold = unsigned(T);
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    // A predicate, when present, can veto the function; optnone functions
+    // are skipped as well.
+    const bool Vetoed = PredicateFtor && !PredicateFtor(F);
+    if (Vetoed || skipOptnoneFunction(F))
+      return false;
+
+    AssumptionCache &Assumptions =
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return simplifyFunctionCFG(F, TTI, &Assumptions, BonusInstThreshold);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+}
+
+// Registers CFGSimplifyPass under the name "simplifycfg" and declares the
+// analyses it depends on.
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+
+// Public interface to the CFGSimplification pass
+// Threshold and Ftor are forwarded unchanged to the CFGSimplifyPass
+// constructor; the caller receives a newly allocated pass.
+FunctionPass *
+llvm::createCFGSimplificationPass(int Threshold,
+ std::function<bool(const Function &)> Ftor) {
+ return new CFGSimplifyPass(Threshold, Ftor);
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
new file mode 100644
index 0000000..64109b2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -0,0 +1,290 @@
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sink"
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+namespace {
+ // Function pass that moves instructions into successor blocks. The three
+ // analysis pointers below are refreshed at the start of every
+ // runOnFunction call.
+ class Sinking : public FunctionPass {
+ DominatorTree *DT;
+ LoopInfo *LI;
+ AliasAnalysis *AA;
+
+ public:
+ static char ID; // Pass identification
+ Sinking() : FunctionPass(ID) {
+ initializeSinkingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ // The pass declares that it leaves the CFG untouched, requires alias
+ // analysis, the dominator tree and loop info, and preserves the latter two.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+ private:
+ bool ProcessBlock(BasicBlock &BB);
+ bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores);
+ bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
+ bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
+ };
+} // end anonymous namespace
+
+// Registers the pass under the name "sink" together with the analyses it
+// depends on.
+char Sinking::ID = 0;
+INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
+
+// Public factory: returns a newly allocated Sinking pass.
+FunctionPass *llvm::createSinkingPass() { return new Sinking(); }
+
+/// AllUsesDominatedByBlock - Return true if every use of \p Inst occurs in a
+/// block dominated by \p BB. A use by a PHI node counts as occurring in the
+/// corresponding incoming block rather than in the PHI's own block.
+bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
+                                      BasicBlock *BB) const {
+  // Ignoring debug uses is necessary so debug info doesn't affect the code.
+  // This may leave a referencing dbg_value in the original block, before
+  // the definition of the vreg. Dwarf generator handles this although the
+  // user might not get the right info at runtime.
+  // NOTE(review): nothing in this loop filters debug users explicitly;
+  // presumably they do not show up in uses() -- confirm.
+  for (Use &TheUse : Inst->uses()) {
+    Instruction *UserInst = cast<Instruction>(TheUse.getUser());
+
+    BasicBlock *WhereUsed;
+    if (PHINode *Phi = dyn_cast<PHINode>(UserInst)) {
+      // The operand of a PHI is consumed at the end of the matching
+      // predecessor, not in the block that holds the PHI.
+      unsigned Incoming =
+          PHINode::getIncomingValueNumForOperand(TheUse.getOperandNo());
+      WhereUsed = Phi->getIncomingBlock(Incoming);
+    } else {
+      WhereUsed = UserInst->getParent();
+    }
+
+    if (!DT->dominates(BB, WhereUsed))
+      return false;
+  }
+  return true;
+}
+
+bool Sinking::runOnFunction(Function &F) {
+  // Refresh the cached analyses for this function.
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  // Keep sweeping over all blocks until a sweep sinks nothing.
+  bool EverMadeChange = false;
+  bool SweepMadeChange;
+  do {
+    SweepMadeChange = false;
+    DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+    for (BasicBlock &Block : F)
+      SweepMadeChange |= ProcessBlock(Block);
+    EverMadeChange |= SweepMadeChange;
+    NumSinkIter++;
+  } while (SweepMadeChange);
+
+  return EverMadeChange;
+}
+
+// Tries to sink each instruction of BB, visiting them from the last
+// instruction to the first. Returns true if at least one instruction was
+// sunk.
+bool Sinking::ProcessBlock(BasicBlock &BB) {
+ // Can't sink anything out of a block that has less than two successors.
+ if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT->isReachableFromEntry(&BB)) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ BasicBlock::iterator I = BB.end();
+ --I;
+ bool ProcessedBegin = false;
+ // Instructions that may write to memory, collected while walking upwards.
+ SmallPtrSet<Instruction *, 8> Stores;
+ do {
+ Instruction *Inst = &*I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == BB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ // Debug intrinsics are never sunk; 'continue' jumps to the loop condition.
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+ if (SinkInstruction(Inst, Stores))
+ ++NumSunk, MadeChange = true;
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+/// Returns true if \p Inst may be moved out of its block. Instructions that
+/// may write to memory are never movable; they are recorded in \p Stores so
+/// that loads examined later can be checked against them.
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
+                         SmallPtrSetImpl<Instruction *> &Stores) {
+
+  if (Inst->mayWriteToMemory()) {
+    Stores.insert(Inst);
+    return false;
+  }
+
+  // A load cannot move if any recorded store may modify its location.
+  if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
+    MemoryLocation LoadLoc = MemoryLocation::get(Load);
+    for (Instruction *Store : Stores) {
+      if (AA->getModRefInfo(Store, LoadLoc) & MRI_Mod)
+        return false;
+    }
+  }
+
+  // Terminators, PHIs, EH pads and instructions that may throw stay put.
+  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst))
+    return false;
+  if (Inst->isEHPad() || Inst->mayThrow())
+    return false;
+
+  // Convergent operations cannot be made control-dependent on additional
+  // values.
+  if (auto Call = CallSite(Inst))
+    return !Call.hasFnAttr(Attribute::Convergent);
+
+  return true;
+}
+
+/// IsAcceptableTarget - Return true if Inst may be sunk into the block
+/// SuccToSinkTo.
+bool Sinking::IsAcceptableTarget(Instruction *Inst,
+                                 BasicBlock *SuccToSinkTo) const {
+  assert(Inst && "Instruction to be sunk is null");
+  assert(SuccToSinkTo && "Candidate sink target is null");
+
+  BasicBlock *Home = Inst->getParent();
+
+  // An instruction cannot be sunk into the block it already lives in; loops
+  // can make a block its own successor.
+  if (SuccToSinkTo == Home)
+    return false;
+
+  // Never sink into a block whose terminator is exceptional.
+  if (SuccToSinkTo->getTerminator()->isExceptional())
+    return false;
+
+  // If the target is entered from anywhere other than Home alone, sinking
+  // would introduce computation on different code paths. We could split the
+  // critical edge, but for now extra conditions are imposed instead.
+  // FIXME: Split critical edges if not backedges.
+  const bool EnteredOnlyFromHome = SuccToSinkTo->getUniquePredecessor() == Home;
+  if (!EnteredOnlyFromHome) {
+    // E.g. a load cannot cross a critical edge - there may be stores on
+    // other code paths.
+    if (!isSafeToSpeculativelyExecute(Inst))
+      return false;
+
+    // Without dominance we could be introducing calculations to new code
+    // paths.
+    if (!DT->dominates(Home, SuccToSinkTo))
+      return false;
+
+    // Don't sink instructions into a loop.
+    Loop *TargetLoop = LI->getLoopFor(SuccToSinkTo);
+    Loop *HomeLoop = LI->getLoopFor(Home);
+    if (TargetLoop != nullptr && TargetLoop != HomeLoop)
+      return false;
+  }
+
+  // Finally, every use of the instruction has to be dominated by the
+  // candidate block.
+  return AllUsesDominatedByBlock(Inst, SuccToSinkTo);
+}
+
+/// SinkInstruction - Try to sink the specified instruction out of its current
+/// block into a dominator-tree child or CFG successor of that block. Returns
+/// true if the instruction was moved.
+bool Sinking::SinkInstruction(Instruction *Inst,
+                              SmallPtrSetImpl<Instruction *> &Stores) {
+
+  // Static allocas stay where they are. CodeGen assumes allocas outside the
+  // entry block are dynamically sized stack objects.
+  if (auto *Alloca = dyn_cast<AllocaInst>(Inst)) {
+    if (Alloca->isStaticAlloca())
+      return false;
+  }
+
+  // Bail out unless the instruction may legally be relocated.
+  if (!isSafeToMove(Inst, AA, Stores))
+    return false;
+
+  // FIXME: This should include support for sinking instructions within the
+  // block they are currently in to shorten the live ranges. We often get
+  // instructions sunk into the top of a large block, but it would be better to
+  // also sink them down before their first use in the block. This xform has to
+  // be careful not to *increase* register pressure though, e.g. sinking
+  // "x = y + z" down if it kills y and z would increase the live ranges of y
+  // and z and only shrink the live range of x.
+
+  BasicBlock *Home = Inst->getParent();
+
+  // The block the instruction will be sunk into, once one is chosen.
+  BasicBlock *SuccToSinkTo = nullptr;
+
+  // Instructions can only be sunk if all their uses are in blocks dominated
+  // by the target. First try the blocks immediately dominated by Home, i.e.
+  // the children of Home's dominator tree node.
+  DomTreeNode *HomeNode = DT->getNode(Home);
+  for (DomTreeNode::iterator Child = HomeNode->begin(),
+                             ChildEnd = HomeNode->end();
+       Child != ChildEnd; ++Child) {
+    BasicBlock *Candidate = (*Child)->getBlock();
+    if ((*Child)->getIDom()->getBlock() == Home &&
+        IsAcceptableTarget(Inst, Candidate)) {
+      SuccToSinkTo = Candidate;
+      break;
+    }
+  }
+
+  // If none of those qualified, fall back to the first acceptable CFG
+  // successor, if any.
+  if (!SuccToSinkTo) {
+    for (succ_iterator Succ = succ_begin(Home), SuccEnd = succ_end(Home);
+         Succ != SuccEnd; ++Succ) {
+      if (IsAcceptableTarget(Inst, *Succ)) {
+        SuccToSinkTo = *Succ;
+        break;
+      }
+    }
+  }
+
+  // No suitable block: leave the instruction alone.
+  if (!SuccToSinkTo)
+    return false;
+
+  DEBUG(dbgs() << "Sink" << *Inst << " (";
+        Home->printAsOperand(dbgs(), false);
+        dbgs() << " -> ";
+        SuccToSinkTo->printAsOperand(dbgs(), false);
+        dbgs() << ")\n");
+
+  // Relocate the instruction to the first insertion point of the target.
+  Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
new file mode 100644
index 0000000..147d615
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -0,0 +1,243 @@
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+// if (b > 0)
+// c = a[i + 1]
+// if (d > 0)
+// e = a[i + 2]
+//
+// turns into
+//
+// p = &a[i + 1];
+// if (b > 0)
+// c = *p;
+// q = &a[i + 2];
+// if (d > 0)
+// e = *q;
+//
+// which could later be optimized to
+//
+// r = &a[i];
+// if (b > 0)
+// c = r[1];
+// if (d > 0)
+// e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativelyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, while this pass is intended to do more
+// aggressive speculation while counting on later passes to either capitalize on
+// that or clean it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// Upper bound on the total cost hoisted out of one block: the more that is
+// speculated, the greater the risk that speculation does not pay off.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+ "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden, // default: 7
+ cl::desc("Speculative execution is not applied to basic blocks where "
+ "the cost of the instructions to speculatively execute "
+ "exceeds this limit."));
+
+// Upper bound on the number of instructions left behind in a block.
+// Speculating just a few instructions from a larger block tends not to be
+// profitable; one reason is that small basic blocks are more likely to be
+// candidates for further optimization.
+static cl::opt<unsigned> SpecExecMaxNotHoisted(
+ "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden, // default: 5
+ cl::desc("Speculative execution is not applied to basic blocks where the "
+ "number of instructions that would not be speculatively executed "
+ "exceeds this limit."));
+
+namespace {
+class SpeculativeExecution : public FunctionPass { // hoists code above two-way branches
+ public:
+ static char ID; // handed to the FunctionPass constructor below
+ SpeculativeExecution(): FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override; // requires TTI
+ bool runOnFunction(Function &F) override; // true if any block was changed
+
+ private:
+ bool runOnBasicBlock(BasicBlock &B); // picks the successor of B to hoist from
+ bool considerHoistingFromTo(BasicBlock &FromBlock, BasicBlock &ToBlock); // true if hoisted
+
+ const TargetTransformInfo *TTI = nullptr; // assigned in runOnFunction()
+};
+} // namespace
+
+char SpeculativeExecution::ID = 0; // definition of the static member declared in the class
+INITIALIZE_PASS_BEGIN(SpeculativeExecution, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // matches getAnalysisUsage()
+INITIALIZE_PASS_END(SpeculativeExecution, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+
+void SpeculativeExecution::getAnalysisUsage(AnalysisUsage &AU) const { // declares required analyses
+ AU.addRequired<TargetTransformInfoWrapperPass>(); // queried in runOnFunction() to obtain TTI
+}
+
+// Runs the hoisting on every basic block of F. Returns true if any block was
+// changed.
+bool SpeculativeExecution::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  // Cache the cost model for this function; the per-block helpers read it.
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  bool AnyHoisted = false;
+  for (Function::iterator BB = F.begin(), BBEnd = F.end(); BB != BBEnd; ++BB) {
+    // Every block is visited, whether or not an earlier one changed.
+    if (runOnBasicBlock(*BB))
+      AnyHoisted = true;
+  }
+  return AnyHoisted;
+}
+
+// Looks for an if-then / if-else triangle or a degenerate diamond rooted at
+// B's two-way branch and, if one is found, tries to hoist instructions from
+// the relevant successor into B. Returns true if anything was hoisted.
+bool SpeculativeExecution::runOnBasicBlock(BasicBlock &B) {
+  auto *Branch = dyn_cast<BranchInst>(B.getTerminator());
+  if (Branch == nullptr)
+    return false;
+  if (Branch->getNumSuccessors() != 2)
+    return false;
+
+  BasicBlock *First = Branch->getSuccessor(0);
+  BasicBlock *Second = Branch->getSuccessor(1);
+
+  // Self-loops and branches whose two targets coincide are not handled.
+  if (First == &B || Second == &B || First == Second)
+    return false;
+
+  const bool FirstHasOnePred = First->getSinglePredecessor() != nullptr;
+  const bool SecondHasOnePred = Second->getSinglePredecessor() != nullptr;
+  BasicBlock *AfterFirst = First->getSingleSuccessor();
+  BasicBlock *AfterSecond = Second->getSingleSuccessor();
+
+  // Triangle: First falls through into Second (if-then).
+  if (FirstHasOnePred && AfterFirst == Second)
+    return considerHoistingFromTo(*First, B);
+
+  // Triangle: Second falls through into First (if-else).
+  if (SecondHasOnePred && AfterSecond == First)
+    return considerHoistingFromTo(*Second, B);
+
+  // Diamond (if-then-else): both arms rejoin at a common block other than B.
+  // Only handled when one arm does nothing, which makes it equivalent to an
+  // if-then or if-else.
+  const bool IsDiamond = FirstHasOnePred && SecondHasOnePred &&
+                         AfterSecond != nullptr && AfterSecond != &B &&
+                         AfterSecond == AfterFirst;
+  if (IsDiamond) {
+    // A block holding a single instruction holds only its terminator, so
+    // that arm does nothing. This does happen.
+    if (Second->size() == 1) // equivalent to if-then
+      return considerHoistingFromTo(*First, B);
+    if (First->size() == 1) // equivalent to if-else
+      return considerHoistingFromTo(*Second, B);
+  }
+
+  return false;
+}
+
+// Returns the TTI user cost of I if its opcode is on the whitelist of
+// speculatable operations, and UINT_MAX otherwise.
+static unsigned ComputeSpeculationCost(const Instruction *I,
+                                       const TargetTransformInfo &TTI) {
+  bool Whitelisted = false;
+  switch (Operator::getOpcode(I)) {
+  // Address computation and selection.
+  case Instruction::GetElementPtr:
+  case Instruction::Select:
+  // Integer arithmetic.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  // Bitwise logic and shifts.
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  // Integer extensions.
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    Whitelisted = true;
+    break;
+
+  default:
+    break;
+  }
+
+  if (!Whitelisted)
+    return UINT_MAX; // Disallow anything not whitelisted.
+  return TTI.getUserCost(I);
+}
+
+// Decides which instructions of FromBlock can be speculated and, if the cost
+// limits allow, moves them in front of ToBlock's terminator. Returns true if
+// instructions were moved.
+bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock,
+                                                  BasicBlock &ToBlock) {
+  // Instructions that have to stay in FromBlock.
+  SmallSet<const Instruction *, 8> LeftBehind;
+
+  // True if some operand of U is an instruction that stays behind; such a
+  // user cannot be hoisted above its operand.
+  const auto UsesLeftBehindValue = [&LeftBehind](User *U) {
+    for (Value *Operand : U->operand_values()) {
+      Instruction *Def = dyn_cast<Instruction>(Operand);
+      if (Def != nullptr && LeftBehind.count(Def) > 0)
+        return true;
+    }
+    return false;
+  };
+
+  // Pass 1: classify every instruction and enforce both limits.
+  unsigned HoistedCost = 0;
+  for (Instruction &I : FromBlock) {
+    const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+    const bool Hoistable = Cost != UINT_MAX &&
+                           isSafeToSpeculativelyExecute(&I) &&
+                           !UsesLeftBehindValue(&I);
+    if (!Hoistable) {
+      LeftBehind.insert(&I);
+      if (LeftBehind.size() > SpecExecMaxNotHoisted)
+        return false; // too much left behind
+      continue;
+    }
+
+    HoistedCost += Cost;
+    if (HoistedCost > SpecExecMaxSpeculationCost)
+      return false; // too much to hoist
+  }
+
+  if (HoistedCost == 0)
+    return false; // nothing to hoist
+
+  // Pass 2: move everything that was not marked as staying behind.
+  for (auto Next = FromBlock.begin(); Next != FromBlock.end();) {
+    // Advance before moving: moving the instruction changes the list that
+    // is being iterated.
+    auto Current = Next;
+    ++Next;
+    if (LeftBehind.count(&*Current) == 0)
+      Current->moveBefore(ToBlock.getTerminator());
+  }
+  return true;
+}
+
+namespace llvm {
+
+FunctionPass *createSpeculativeExecutionPass() { // factory for the pass defined in this file
+ return new SpeculativeExecution(); // returns a newly allocated pass object
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 0000000..1faa65e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,724 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we found two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S] => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Making SLSR take ILP into account
+// is left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+public:
+ // SLSR candidate. Such a candidate must be in one of the forms described in
+ // the header comments. Candidates are stored in an intrusive list.
+ struct Candidate : public ilist_node<Candidate> {
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Add, // B + i * S
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate()
+ : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
+ Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+ Basis(nullptr) {} // Basis is filled in later, if one is found
+ Kind CandidateKind; // which of the forms above this candidate is in
+ const SCEV *Base; // B in the forms above
+ // Note that Index and Stride of a GEP candidate do not necessarily have the
+ // same integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
+ ConstantInt *Index; // i in the forms above
+ Value *Stride; // S in the forms above
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can correspond to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins;
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis;
+ };
+
+ static char ID;
+
+ StraightLineStrengthReduce()
+ : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+ initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // We do not modify the shape of the CFG.
+ AU.setPreservesCFG();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout(); // cached for all later size and layout queries
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base, stride, candidate kind and instruction type.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+ // Returns whether the candidate can be folded into an addressing mode.
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+ // Returns true if C is already in a simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidatesAndFindBasis(Instruction *I);
+ // Allocate candidates and find bases for Add instructions.
+ void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+ // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+ // candidate.
+ void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidatesAndFindBasisForMul(Instruction *I);
+ // Splits LHS into Base + Index (falling back to LHS + 0) and calls
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
+ Instruction *I);
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+ // A helper function that factors ArrayIdx to a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+
+ const DataLayout *DL; // set in doInitialization()
+ DominatorTree *DT;
+ ScalarEvolution *SE; // NOTE(review): not initialized by the constructor
+ TargetTransformInfo *TTI;
+ ilist<Candidate> Candidates; // appended in discovery order, scanned backwards for bases
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will be actually removed
+ // after all rewriting finishes.
+ std::vector<Instruction *> UnlinkedInstructions;
+};
+} // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0; // definition of the static member declared in the class
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) // the three dependencies mirror getAnalysisUsage()
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() { // factory for the SLSR pass
+ return new StraightLineStrengthReduce(); // returns a newly allocated pass object
+}
+
+// Returns true if C can be rewritten with respect to Basis.
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+                                            const Candidate &C) {
+  // A candidate is never a basis for the instruction it came from.
+  if (Basis.Ins == C.Ins)
+    return false;
+
+  // The instructions must have the same type too. Basis.Base == C.Base
+  // doesn't guarantee their types are the same (PR23975).
+  if (Basis.Ins->getType() != C.Ins->getType())
+    return false;
+
+  // Basis must dominate C in order to rewrite C with respect to Basis.
+  if (!DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()))
+    return false;
+
+  // Finally, base, stride and candidate kind all have to agree.
+  if (Basis.Base != C.Base)
+    return false;
+  if (Basis.Stride != C.Stride)
+    return false;
+  return Basis.CandidateKind == C.CandidateKind;
+}
+
+// Returns whether GEP can be folded into an addressing mode, by summarizing
+// it as (global base | base register) + constant offset + one scaled index
+// and asking TTI whether that mode is legal.
+// TODO: use TTI->getGEPCost.
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+                          const TargetTransformInfo *TTI,
+                          const DataLayout *DL) {
+  // The pointer operand is either a global variable or it needs a base
+  // register.
+  GlobalVariable *BaseGV = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+  bool HasBaseReg = (BaseGV == nullptr);
+
+  int64_t BaseOffset = 0;
+  int64_t Scale = 0;
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (auto Idx = GEP->idx_begin(), IdxEnd = GEP->idx_end(); Idx != IdxEnd;
+       ++Idx, ++GTI) {
+    if (!isa<SequentialType>(*GTI)) {
+      // Struct field: the index is a constant, so add the field offset.
+      StructType *STy = cast<StructType>(*GTI);
+      uint64_t Field = cast<ConstantInt>(*Idx)->getZExtValue();
+      BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
+      continue;
+    }
+
+    int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+    if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*Idx)) {
+      // Constant index: fold it into the constant offset.
+      BaseOffset += ConstIdx->getSExtValue() * ElementSize;
+      continue;
+    }
+
+    // Variable index: needs a scale register, and no addressing mode takes
+    // two of them.
+    if (Scale != 0)
+      return false;
+    Scale = ElementSize;
+  }
+
+  unsigned AddrSpace = GEP->getPointerAddressSpace();
+  return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
+                                    BaseOffset, HasBaseReg, Scale, AddrSpace);
+}
+
+// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
+// Only the type of Base and the value of Index are consulted; Stride itself
+// is not inspected.
+static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
+                          TargetTransformInfo *TTI) {
+  const int64_t Scale = Index->getSExtValue();
+  // Query: no global base, zero offset, a base register, Index as the scale.
+  return TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true, Scale);
+}
+
+// Dispatches on the candidate kind; only Add and GEP candidates can be
+// folded into an addressing mode.
+bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
+                                            TargetTransformInfo *TTI,
+                                            const DataLayout *DL) {
+  switch (C.CandidateKind) {
+  case Candidate::Add:
+    return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
+  case Candidate::GEP:
+    return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI, DL);
+  default:
+    return false;
+  }
+}
+
+// Returns true if at most one index of GEP is not a constant zero. An index
+// that is not a ConstantInt counts as non-zero.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+  unsigned NonZeroCount = 0;
+  for (auto Idx = GEP->idx_begin(), IdxEnd = GEP->idx_end(); Idx != IdxEnd;
+       ++Idx) {
+    ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*Idx);
+    const bool IsConstantZero = ConstIdx != nullptr && ConstIdx->isZero();
+    if (!IsConstantZero)
+      ++NonZeroCount;
+  }
+  return NonZeroCount < 2;
+}
+
+// Returns true if C is already as simple as a rewrite could make it.
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+  switch (C.CandidateKind) {
+  case Candidate::Add:
+    // B + 1 * S or B + (-1) * S
+    return C.Index->isOne() || C.Index->isMinusOne();
+  case Candidate::Mul:
+    // (B + 0) * S
+    return C.Index->isZero();
+  case Candidate::GEP:
+    // (char*)B + S or (char*)B - S
+    return (C.Index->isOne() || C.Index->isMinusOne()) &&
+           hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins));
+  default:
+    // Any other kind is never considered simplest.
+    return false;
+  }
+}
+
+// Creates the candidate <CT, B, Idx, S> for I, looks for its immediate basis
+// among the most recently added candidates, and appends it to Candidates.
+//
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+    Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+    Instruction *I) {
+  Candidate C(CT, B, Idx, S, I);
+
+  // SLSR can complicate an instruction in two cases:
+  //
+  // 1. If we can fold I into an addressing mode, computing I is likely free or
+  //    takes only one instruction.
+  //
+  // 2. I is already in a simplest form. For example, when
+  //      X = B + 8 * S
+  //      Y = B + S,
+  //    rewriting Y to X - 7 * S is probably a bad idea.
+  //
+  // In those cases I is still added to the candidate list so that it can be
+  // the basis of other candidates, but its own basis is left blank so that I
+  // won't be rewritten.
+  const bool WorthRewriting = !isFoldable(C, TTI, DL) && !isSimplestForm(C);
+  if (WorthRewriting) {
+    // Limit the scan radius to avoid running in quadratic time.
+    static const unsigned MaxNumIterations = 50;
+    unsigned NumScanned = 0;
+    // Walk backwards from the newest candidate; the first match is taken as
+    // the immediate basis.
+    for (auto Prev = Candidates.rbegin();
+         Prev != Candidates.rend() && NumScanned < MaxNumIterations;
+         ++Prev, ++NumScanned) {
+      if (!isBasisFor(*Prev, C))
+        continue;
+      C.Basis = &(*Prev);
+      break;
+    }
+  }
+
+  // Whether or not a basis was found, C joins the list so that it can be the
+  // basis of other candidates.
+  Candidates.push_back(C);
+}
+
+// Routes I to the handler for its opcode; instructions that are not an add,
+// a mul or a getelementptr are ignored.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+    Instruction *I) {
+  const unsigned Opcode = I->getOpcode();
+  if (Opcode == Instruction::Add)
+    allocateCandidatesAndFindBasisForAdd(I);
+  else if (Opcode == Instruction::Mul)
+    allocateCandidatesAndFindBasisForMul(I);
+  else if (Opcode == Instruction::GetElementPtr)
+    allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+}
+
+// Tries to match the add I against B + i * S with either operand in the role
+// of B.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+    Instruction *I) {
+  // Only integer-typed adds are considered.
+  if (!isa<IntegerType>(I->getType()))
+    return;
+
+  assert(I->getNumOperands() == 2 && "isn't I an add?");
+  Value *Op0 = I->getOperand(0);
+  Value *Op1 = I->getOperand(1);
+
+  allocateCandidatesAndFindBasisForAdd(Op0, Op1, I);
+  // With identical operands the swapped attempt would add nothing new.
+  if (Op0 == Op1)
+    return;
+  allocateCandidatesAndFindBasisForAdd(Op1, Op0, I);
+}
+
+// Given I = LHS + RHS, factors RHS into Index * Stride and registers
+// LHS + Index * Stride as an Add candidate.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+    Value *LHS, Value *RHS, Instruction *I) {
+  Value *Stride = nullptr;
+  ConstantInt *Index = nullptr;
+
+  if (match(RHS, m_Mul(m_Value(Stride), m_ConstantInt(Index)))) {
+    // I = LHS + RHS = LHS + Index * Stride; both are bound by the match.
+  } else if (match(RHS, m_Shl(m_Value(Stride), m_ConstantInt(Index)))) {
+    // I = LHS + RHS = LHS + (Stride << Index) = LHS + Stride * (1 << Index)
+    APInt One(Index->getBitWidth(), 1);
+    Index = ConstantInt::get(Index->getContext(), One << Index->getValue());
+  } else {
+    // At least, I = LHS + 1 * RHS
+    Index = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
+    Stride = RHS;
+  }
+
+  allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Index,
+                                 Stride, I);
+}
+
+// Returns true if A matches B + C where C is constant; the constant may be
+// on either side of the add.
+static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
+  if (match(A, m_Add(m_Value(B), m_ConstantInt(C))))
+    return true;
+  return match(A, m_Add(m_ConstantInt(C), m_Value(B)));
+}
+
+// Returns true if A matches B | C where C is constant; the constant may be
+// on either side of the or.
+static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
+  if (match(A, m_Or(m_Value(B), m_ConstantInt(C))))
+    return true;
+  return match(A, m_Or(m_ConstantInt(C), m_Value(B)));
+}
+
+// Given I = LHS * RHS, splits LHS into Base + Index and registers
+// (Base + Index) * RHS as a Mul candidate.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+    Value *LHS, Value *RHS, Instruction *I) {
+  Value *Base = nullptr;
+  ConstantInt *Index = nullptr;
+
+  // LHS splits if it is "Base + Index", or if it is "Base | Index" where Base
+  // and Index have no common bits set, because then
+  //   Base | Index = Base + Index.
+  const bool Splits = matchesAdd(LHS, Base, Index) ||
+                      (matchesOr(LHS, Base, Index) &&
+                       haveNoCommonBitsSet(Base, Index, *DL));
+  if (!Splits) {
+    // Otherwise, at least try the form (LHS + 0) * RHS.
+    Base = LHS;
+    Index = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+  }
+
+  allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(Base), Index, RHS,
+                                 I);
+}
+
+// Tries to match the mul I against (B + i) * S with either operand in the
+// role of (B + i).
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+    Instruction *I) {
+  // Only integer-typed muls are considered.
+  // TODO: we could extend SLSR to float and vector types.
+  if (!isa<IntegerType>(I->getType()))
+    return;
+
+  assert(I->getNumOperands() == 2 && "isn't I a mul?");
+  Value *Op0 = I->getOperand(0);
+  Value *Op1 = I->getOperand(1);
+
+  allocateCandidatesAndFindBasisForMul(Op0, Op1, I);
+  // With identical operands the symmetric attempt would add nothing new.
+  if (Op0 == Op1)
+    return;
+  // Symmetrically, try to split the second operand to Base + Index.
+  allocateCandidatesAndFindBasisForMul(Op1, Op0, I);
+}
+
+// Scales Idx by ElementSize and registers the result as a GEP candidate:
+//
+//   I = B + sext(Idx *nsw S) * ElementSize
+//     = B + (sext(Idx) * sext(S)) * ElementSize
+//     = B + (sext(Idx) * ElementSize) * sext(S)
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+    const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+    Instruction *I) {
+  // Casting to IntegerType is safe because we skipped vector GEPs.
+  IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+
+  // The scaled index is materialized as a signed constant of that type.
+  const int64_t ScaledValue =
+      Idx->getSExtValue() * static_cast<int64_t>(ElementSize);
+  ConstantInt *ScaledIdx = ConstantInt::get(IntPtrTy, ScaledValue, true);
+
+  allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
+// Factors ArrayIdx into (constant index) * (stride) in every way recognized
+// here and registers one GEP candidate per factoring.
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+                                                  const SCEV *Base,
+                                                  uint64_t ElementSize,
+                                                  GetElementPtrInst *GEP) {
+  // At least, ArrayIdx = ArrayIdx *nsw 1.
+  ConstantInt *UnitIdx =
+      ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1);
+  allocateCandidatesAndFindBasisForGEP(Base, UnitIdx, ArrayIdx, ElementSize,
+                                       GEP);
+
+  // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+  // itself. This would allow us to handle the shl case for free. However,
+  // matching SCEVs has two issues:
+  //
+  // 1. this would complicate rewriting because the rewriting procedure
+  //    would have to translate SCEVs back to IR instructions. This
+  //    translation is difficult when the factor is further evaluated to a
+  //    composite SCEV.
+  //
+  // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+  //    to strip nsw/nuw flags which are critical for SLSR to trace into
+  //    sext'ed multiplication.
+  Value *Factor = nullptr;
+  ConstantInt *Const = nullptr;
+
+  if (match(ArrayIdx, m_NSWMul(m_Value(Factor), m_ConstantInt(Const)))) {
+    // SLSR is currently unsafe if i * S may overflow.
+    // GEP = Base + sext(Factor *nsw Const) * ElementSize
+    allocateCandidatesAndFindBasisForGEP(Base, Const, Factor, ElementSize, GEP);
+    return;
+  }
+
+  if (match(ArrayIdx, m_NSWShl(m_Value(Factor), m_ConstantInt(Const)))) {
+    // GEP = Base + sext(Factor <<nsw Const) * ElementSize
+    //     = Base + sext(Factor *nsw (1 << Const)) * ElementSize
+    APInt One(Const->getBitWidth(), 1);
+    ConstantInt *PowerOf2 =
+        ConstantInt::get(Const->getContext(), One << Const->getValue());
+    allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, Factor, ElementSize,
+                                         GEP);
+  }
+}
+
+// Registers GEP candidates for every index of GEP that steps through a
+// sequential type. For each such index, all remaining indices are folded into
+// the candidate's base.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+    GetElementPtrInst *GEP) {
+  // TODO: handle vector GEPs
+  if (GEP->getType()->isVectorTy())
+    return;
+
+  // SCEVs of all index operands; slot K belongs to GEP operand K + 1.
+  SmallVector<const SCEV *, 4> IndexExprs;
+  for (auto IdxIt = GEP->idx_begin(); IdxIt != GEP->idx_end(); ++IdxIt)
+    IndexExprs.push_back(SE->getSCEV(*IdxIt));
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  const unsigned NumOperands = GEP->getNumOperands();
+  for (unsigned OpNo = 1; OpNo != NumOperands; ++OpNo) {
+    // GTI is advanced here whether or not the test succeeds.
+    const bool StepsThroughSequential = isa<SequentialType>(*GTI++);
+    if (!StepsThroughSequential)
+      continue;
+
+    // Temporarily zero the current index so that the expression built below
+    // is GEP's base plus the offsets of all indices except this one.
+    const unsigned Slot = OpNo - 1;
+    const SCEV *SavedIndexExpr = IndexExprs[Slot];
+    IndexExprs[Slot] = SE->getZero(SavedIndexExpr->getType());
+    const SCEV *BaseExpr = SE->getGEPExpr(
+        GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()),
+        IndexExprs, GEP->isInBounds());
+
+    Value *ArrayIdx = GEP->getOperand(OpNo);
+    uint64_t ElementSize = DL->getTypeAllocSize(*GTI);
+    factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+
+    // Array indices are typically sign-extended to the pointer size, so when
+    // ArrayIdx is the sext of some value, try to factor that value as well.
+    Value *NarrowIdx = nullptr;
+    if (match(ArrayIdx, m_SExt(m_Value(NarrowIdx))))
+      factorArrayIndex(NarrowIdx, BaseExpr, ElementSize, GEP);
+
+    // Put the real index expression back for the next iteration.
+    IndexExprs[Slot] = SavedIndexExpr;
+  }
+}
+
+// Sign-extends the narrower of A and B so that both have the same bit width.
+// Leaves both untouched when the widths already agree.
+static void unifyBitWidth(APInt &A, APInt &B) {
+  const unsigned WidthA = A.getBitWidth();
+  const unsigned WidthB = B.getBitWidth();
+  if (WidthA == WidthB)
+    return;
+  if (WidthA < WidthB)
+    A = A.sext(WidthB);
+  else
+    B = B.sext(WidthA);
+}
+
+// Emits, through Builder, the IR that computes
+//   Bump = C - Basis = (i' - i) * S
+// where i' and i are the indices of C and Basis and S is their common stride.
+//
+// \param Basis            the candidate that C is rewritten against
+// \param C                the candidate being rewritten
+// \param Builder          used to create any instruction the bump needs
+// \param DL               used to query the element size of a GEP basis
+// \param BumpWithUglyGEP  [out] true iff the candidates are GEPs and the index
+//                         difference is not a multiple of the element size;
+//                         the returned bump is then based on the undivided
+//                         index difference
+// \returns the value of the bump (possibly C.Stride itself)
+//
+// All tests on the index difference are done on the APInt directly instead of
+// going through getSExtValue(), which asserts when the value does not fit in
+// 64 bits.
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+                                            const Candidate &C,
+                                            IRBuilder<> &Builder,
+                                            const DataLayout *DL,
+                                            bool &BumpWithUglyGEP) {
+  APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+  unifyBitWidth(Idx, BasisIdx);
+  APInt IndexOffset = Idx - BasisIdx;
+
+  BumpWithUglyGEP = false;
+  if (Basis.CandidateKind == Candidate::GEP) {
+    APInt ElementSize(
+        IndexOffset.getBitWidth(),
+        DL->getTypeAllocSize(
+            cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType()));
+    APInt Q, R;
+    APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+    if (!R)
+      IndexOffset = Q; // Whole number of elements: bump in element units.
+    else
+      BumpWithUglyGEP = true;
+  }
+
+  // Compute Bump = C - Basis = (i' - i) * S.
+  // Common case 1: if (i' - i) is 1, Bump = S.
+  if (IndexOffset == 1)
+    return C.Stride;
+  // Common case 2: if (i' - i) is -1, Bump = -S.
+  if (IndexOffset.isAllOnesValue())
+    return Builder.CreateNeg(C.Stride);
+
+  // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+  // have different bit widths.
+  IntegerType *DeltaType =
+      IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+  Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+  if (IndexOffset.isPowerOf2()) {
+    // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+    ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+    return Builder.CreateShl(ExtendedStride, Exponent);
+  }
+  const APInt NegatedOffset = -IndexOffset;
+  if (NegatedOffset.isPowerOf2()) {
+    // If (i - i') is a power of 2, Bump = -(sext/trunc(S) << log(i - i')).
+    ConstantInt *Exponent =
+        ConstantInt::get(DeltaType, NegatedOffset.logBase2());
+    return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+  }
+  Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+  return Builder.CreateMul(ExtendedStride, Delta);
+}
+
+// Replaces C.Ins by "Basis.Ins + Bump" (or the GEP equivalent), where Bump is
+// the cheaper difference between the two candidates. C.Ins is only unlinked
+// here; it is queued in UnlinkedInstructions for later deletion.
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+    const Candidate &C, const Candidate &Basis) {
+  assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+         C.Stride == Basis.Stride);
+  // Candidates are rewritten in post-order, hence the basis of a candidate is
+  // never unlinked before the candidate itself.
+  assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+  // One instruction may back several candidates. A rewritten instruction is
+  // therefore only unlinked (its parent becomes nullptr) rather than deleted,
+  // which lets us recognize and skip its remaining candidates here.
+  if (C.Ins->getParent() == nullptr)
+    return;
+
+  IRBuilder<> Builder(C.Ins);
+  bool UseUglyGEP;
+  Value *Bump = emitBump(Basis, C, Builder, DL, UseUglyGEP);
+  Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+  switch (C.CandidateKind) {
+  case Candidate::Add:
+  case Candidate::Mul:
+    // C = Basis + Bump
+    if (!BinaryOperator::isNeg(Bump)) {
+      // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+      // usually unsound, e.g.,
+      //
+      // X = (-2 +nsw 1) *nsw INT_MAX
+      // Y = (-2 +nsw 3) *nsw INT_MAX
+      //   =>
+      // Y = X + 2 * INT_MAX
+      //
+      // Neither + and * in the resultant expression are nsw.
+      Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+    } else {
+      // Bump is a neg instruction: emit C = Basis - (-Bump) instead.
+      Reduced =
+          Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+      // Only the operand of the negation is used, so Bump itself may now be
+      // trivially dead.
+      RecursivelyDeleteTriviallyDeadInstructions(Bump);
+    }
+    break;
+  case Candidate::GEP: {
+    Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+    const bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+    if (UseUglyGEP) {
+      // C = (char *)Basis + Bump
+      unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+      Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+      Value *BytePtr = Builder.CreateBitCast(Basis.Ins, CharTy);
+      Value *Bumped =
+          InBounds
+              ? Builder.CreateInBoundsGEP(Builder.getInt8Ty(), BytePtr, Bump)
+              : Builder.CreateGEP(Builder.getInt8Ty(), BytePtr, Bump);
+      Reduced = Builder.CreateBitCast(Bumped, C.Ins->getType());
+    } else {
+      // C = gep Basis, Bump
+      // Canonicalize bump to pointer size.
+      Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+      Reduced = InBounds ? Builder.CreateInBoundsGEP(nullptr, Basis.Ins, Bump)
+                         : Builder.CreateGEP(nullptr, Basis.Ins, Bump);
+    }
+    break;
+  }
+  default:
+    llvm_unreachable("C.CandidateKind is invalid");
+  }
+  Reduced->takeName(C.Ins);
+  C.Ins->replaceAllUsesWith(Reduced);
+  // Unlink C.Ins so that other candidates referring to it are skipped. The
+  // instruction is actually deleted at the end of runOnFunction.
+  C.Ins->removeFromParent();
+  UnlinkedInstructions.push_back(C.Ins);
+}
+
+// Pass entry point: collects candidates over the whole function, rewrites
+// those that have a basis and finally deletes the replaced instructions.
+// Returns true iff at least one instruction was replaced.
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+  // Walk the dominator tree depth-first so that every possible basis of a
+  // candidate is already in Candidates when the candidate is processed.
+  typedef GraphTraits<DominatorTree *> DomTreeTraits;
+  for (auto NodeIt = DomTreeTraits::nodes_begin(DT),
+            NodeEnd = DomTreeTraits::nodes_end(DT);
+       NodeIt != NodeEnd; ++NodeIt) {
+    for (auto &Inst : *NodeIt->getBlock())
+      allocateCandidatesAndFindBasis(&Inst);
+  }
+
+  // Rewrite in the reverse order: a candidate being rewritten is then never
+  // the basis of a candidate that is still pending.
+  for (; !Candidates.empty(); Candidates.pop_back()) {
+    const Candidate &C = Candidates.back();
+    if (C.Basis != nullptr)
+      rewriteCandidateWithBasis(C, *C.Basis);
+  }
+
+  // The pass changed the function iff something was unlinked.
+  const bool Changed = !UnlinkedInstructions.empty();
+
+  // Delete all unlinked instructions, together with any operand that becomes
+  // trivially dead once detached.
+  for (auto *Dead : UnlinkedInstructions) {
+    for (unsigned OpNo = 0, NumOps = Dead->getNumOperands(); OpNo != NumOps;
+         ++OpNo) {
+      Value *Op = Dead->getOperand(OpNo);
+      Dead->setOperand(OpNo, nullptr);
+      RecursivelyDeleteTriviallyDeadInstructions(Op);
+    }
+    delete Dead;
+  }
+  UnlinkedInstructions.clear();
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
new file mode 100644
index 0000000..662513c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -0,0 +1,953 @@
+//===-- StructurizeCFG.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "structurizecfg"
+
+namespace {
+
+// Aliases for the container types used throughout this pass.
+
+using BBValuePair = std::pair<BasicBlock *, Value *>;
+
+using RNVector = SmallVector<RegionNode *, 8>;
+using BBVector = SmallVector<BasicBlock *, 8>;
+using BranchVector = SmallVector<BranchInst *, 8>;
+using BBValueVector = SmallVector<BBValuePair, 2>;
+
+using BBSet = SmallPtrSet<BasicBlock *, 8>;
+
+using PhiMap = MapVector<PHINode *, BBValueVector>;
+using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
+
+using DTN2UnsignedMap = DenseMap<DomTreeNode *, unsigned>;
+using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
+using BBPredicates = DenseMap<BasicBlock *, Value *>;
+using PredMap = DenseMap<BasicBlock *, BBPredicates>;
+using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+
+// Name given to every block this pass creates.
+
+static const char *const FlowBlockName = "Flow";
+
+/// @brief Find the nearest common dominator for multiple BasicBlocks
+///
+/// Blocks are fed in one at a time with addBlock(); getResult() then yields
+/// the nearest block dominating all of them. The first block added defines a
+/// reference chain up the dominator tree (numbered 1, 2, 3, ... starting at
+/// the block itself); every later block only has to walk up until it reaches
+/// a node that has already been seen.
+///
+/// Helper class for StructurizeCFG
+/// TODO: Maybe move into common code
+class NearestCommonDominator {
+  DominatorTree *DT;
+
+  /// Position of each visited node on the reference chain of the first block
+  /// (1 = the first block itself); 0 for visited nodes that are off the chain.
+  DTN2UnsignedMap IndexMap;
+
+  BasicBlock *Result;     ///< Current common dominator; null before addBlock.
+  unsigned ResultIndex;   ///< Chain position of Result.
+  bool ExplicitMentioned; ///< See wasResultExplicitMentioned().
+
+public:
+  /// \brief Start a new query
+  ///
+  /// All members are initialized so that the accessors return well-defined
+  /// values even when no block has been added yet.
+  explicit NearestCommonDominator(DominatorTree *DomTree)
+      : DT(DomTree), Result(nullptr), ResultIndex(0),
+        ExplicitMentioned(false) {}
+
+  /// \brief Add BB to the resulting dominator
+  ///
+  /// \param BB        block the result has to dominate
+  /// \param Remember  whether BB counts as "explicitly mentioned" should it
+  ///                  turn out to be the result
+  void addBlock(BasicBlock *BB, bool Remember = true) {
+    DomTreeNode *Node = DT->getNode(BB);
+
+    if (!Result) {
+      // First block: number its whole chain of immediate dominators.
+      unsigned Numbering = 0;
+      for (; Node; Node = Node->getIDom())
+        IndexMap[Node] = ++Numbering;
+      Result = BB;
+      ResultIndex = 1;
+      ExplicitMentioned = Remember;
+      return;
+    }
+
+    // Walk up from BB, marking new nodes as off-chain, until a node that was
+    // already visited is reached.
+    for (; Node && !IndexMap.count(Node); Node = Node->getIDom())
+      IndexMap[Node] = 0;
+
+    assert(Node && "Dominator tree invalid!");
+
+    unsigned Numbering = IndexMap[Node];
+    if (Numbering > ResultIndex) {
+      // The meeting point is further up the chain than the current result.
+      Result = Node->getBlock();
+      ResultIndex = Numbering;
+      ExplicitMentioned = Remember && (Result == BB);
+    } else if (Numbering == ResultIndex) {
+      ExplicitMentioned |= Remember;
+    }
+  }
+
+  /// \brief Is "Result" one of the BBs added with "Remember" = True?
+  bool wasResultExplicitMentioned() const {
+    return ExplicitMentioned;
+  }
+
+  /// \brief Get the query result
+  BasicBlock *getResult() const {
+    return Result;
+  }
+};
+
+/// @brief Transforms the control flow graph on one single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/
+/// 3
+/// ||   Where:
+/// | |  1 = "If" block, calculates the condition
+/// 4 |  2 = "Then" subregion, runs if the condition is true
+/// | /  3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
+/// |/   4 = "Else" optional subregion, runs if the condition is false
+/// 5    5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region.
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incoming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// Additionally to that even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^  Where:
+/// | /  1 = "Entry" block
+/// |/   2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3    3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consists of a network of PHI nodes where the true incoming values express
+/// breaks and the false values express continue states.
+class StructurizeCFG : public RegionPass {
+ Type *Boolean; // The i1 type and its constants below are cached by doInitialization.
+ ConstantInt *BoolTrue;
+ ConstantInt *BoolFalse;
+ UndefValue *BoolUndef;
+ // Function and region currently being processed; both are set by runOnRegion.
+ Function *Func;
+ Region *ParentRegion;
+ // Analyses fetched at the start of runOnRegion.
+ DominatorTree *DT;
+ LoopInfo *LI;
+ // Region nodes still to be wired (consumed from the back) and entry blocks already seen.
+ RNVector Order;
+ BBSet Visited;
+ // PHI bookkeeping: inputs removed per target block, and new predecessors given undef placeholders.
+ BBPhiMap DeletedPhis;
+ BB2BBVecMap AddedPhis;
+ // Forward-edge predicates per block and the placeholder branches created by wireFlow.
+ PredMap Predicates;
+ BranchVector Conditions;
+ // Back edges (target -> node holding the edge), their predicates and the placeholder loop branches.
+ BB2BBMap Loops;
+ PredMap LoopPreds;
+ BranchVector LoopConds;
+ // Node wired most recently; nullptr at the region entry or once control has left the region.
+ RegionNode *PrevNode;
+ /// Fill Order with the nodes of ParentRegion.
+ void orderNodes();
+ /// Record in Loops every edge from N to an already visited block.
+ void analyzeLoops(RegionNode *N);
+ /// Return a value computing the negation of Condition, reusing an existing one when possible.
+ Value *invert(Value *Condition);
+ /// Build the condition under which successor Idx of Term is taken (negated if Invert).
+ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+ /// Collect the predicates of the in-region edges leading to N's entry block.
+ void gatherPredicates(RegionNode *N);
+ /// Recompute Predicates, Loops, LoopPreds and Visited from Order.
+ void collectInfos();
+ /// Fill in the real conditions of the branches in Conditions, or in LoopConds if Loops is true.
+ void insertConditions(bool Loops);
+ /// Remove the PHI inputs of To that come from From and remember them in DeletedPhis.
+ void delPhiValues(BasicBlock *From, BasicBlock *To);
+ /// Give every PHI of To an undef input for From and remember the pair in AddedPhis.
+ void addPhiValues(BasicBlock *From, BasicBlock *To);
+ /// Replace the undef PHI inputs recorded in AddedPhis by the real values.
+ void setPhiValues();
+ /// Remove BB's PHI inputs from its successors, then erase its terminator (if any).
+ void killTerminator(BasicBlock *BB);
+ /// Redirect the exit edge(s) of Node to NewExit, optionally updating NewExit's immediate dominator.
+ void changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator);
+ /// Create a new "Flow" block whose immediate dominator is Dominator.
+ BasicBlock *getNextFlow(BasicBlock *Dominator);
+ /// Return PrevNode's entry if it can serve as flow block, otherwise a new flow block after PrevNode.
+ BasicBlock *needPrefix(bool NeedEmpty);
+ /// Return the region exit if Order is empty and ExitUseAllowed, otherwise a new flow block.
+ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+ /// Set PrevNode to the node of BB, or to nullptr when BB is outside ParentRegion.
+ void setPrevNode(BasicBlock *BB);
+ /// Does BB dominate every predicate block of Node?
+ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+ /// Can we predict that Node will always be executed?
+ bool isPredictableTrue(RegionNode *Node);
+ /// Take one node from Order and wire it up.
+ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+ /// Wire up the next node of Order and, if it starts a loop, the whole loop.
+ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+ /// Build the structured control flow; branch conditions and PHI inputs stay placeholders.
+ void createFlow();
+ /// Rewrite uses that are no longer dominated by their definition.
+ void rebuildSSA();
+
+public:
+ static char ID;
+
+ StructurizeCFG() :
+ RegionPass(ID) {
+ initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
+ }
+
+ using Pass::doInitialization;
+ bool doInitialization(Region *R, RGPassManager &RGM) override;
+
+ bool runOnRegion(Region *R, RGPassManager &RGM) override;
+
+ const char *getPassName() const override {
+ return "Structurize control flow";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ RegionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char StructurizeCFG::ID = 0;
+
+// Pass registration. Every analysis that getAnalysisUsage() lists as required
+// is named as a dependency here so that it is registered before this pass;
+// this includes LoopInfoWrapperPass, which runOnRegion() queries.
+INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+                    false, false)
+
+/// \brief Cache the boolean type and the constants of that type which the
+/// pass uses. Never modifies the IR, hence always returns false.
+bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+  LLVMContext &Ctx = R->getEntry()->getContext();
+
+  BoolTrue = ConstantInt::getTrue(Ctx);
+  BoolFalse = ConstantInt::getFalse(Ctx);
+
+  Boolean = Type::getInt1Ty(Ctx);
+  BoolUndef = UndefValue::get(Boolean);
+
+  return false;
+}
+
+/// \brief Build up the general order of nodes
+///
+/// Appends the nodes of ParentRegion to Order. The nodes are taken in reverse
+/// post-order, except that all nodes of a loop are emitted before moving back
+/// to the enclosing loop. The resulting list is reversed at the end because
+/// the rest of the pass consumes Order from the back.
+void StructurizeCFG::orderNodes() {
+  RNVector TempOrder;
+  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+  TempOrder.append(RPOT.begin(), RPOT.end());
+
+  // Number of region nodes per loop that have not been ordered yet. A node is
+  // attributed to the loop of its entry block (nullptr when outside any loop).
+  std::map<Loop*, unsigned> LoopBlocks;
+  for (RegionNode *RN : TempOrder)
+    ++LoopBlocks[LI->getLoopFor(RN->getEntry())];
+
+  // Nodes that are already part of Order. Kept alongside Order so that the
+  // membership test below does not have to scan the whole vector.
+  SmallPtrSet<RegionNode *, 8> Placed(Order.begin(), Order.end());
+
+  // The reverse post-order traversal of the list gives us an ordering close
+  // to what we want. The only problem with it is that sometimes backedges
+  // for outer loops will be visited before backedges for inner loops.
+  unsigned CurrentLoopDepth = 0;
+  Loop *CurrentLoop = nullptr;
+  for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E;
+       ++I) {
+    // Skip nodes that were pulled forward while completing a loop.
+    if (Placed.count(*I))
+      continue;
+
+    BasicBlock *BB = (*I)->getEntry();
+    unsigned LoopDepth = LI->getLoopDepth(BB);
+
+    if (LoopDepth < CurrentLoopDepth) {
+      // Make sure we have visited all blocks in this loop before moving back to
+      // the outer loop.
+      RNVector::iterator LoopI = I;
+      while (LoopBlocks[CurrentLoop]) {
+        ++LoopI;
+        assert(LoopI != E && "Loop node count out of sync with node list!");
+        BasicBlock *LoopBB = (*LoopI)->getEntry();
+        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+          --LoopBlocks[CurrentLoop];
+          Order.push_back(*LoopI);
+          Placed.insert(*LoopI);
+        }
+      }
+    }
+
+    CurrentLoop = LI->getLoopFor(BB);
+    if (CurrentLoop)
+      --LoopBlocks[CurrentLoop];
+
+    CurrentLoopDepth = LoopDepth;
+    Order.push_back(*I);
+    Placed.insert(*I);
+  }
+
+  // This pass originally used a post-order traversal and then operated on
+  // the list in reverse. Now that we are using a reverse post-order traversal
+  // rather than re-working the whole pass to operate on the list in order,
+  // we just reverse the list and continue to operate on it in reverse.
+  std::reverse(Order.begin(), Order.end());
+}
+
+/// \brief Determine the end of the loops
+///
+/// Every edge leaving N towards an already visited block is a back edge and is
+/// recorded in Loops, keyed by its target.
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
+  if (!N->isSubRegion()) {
+    // Plain block: test each successor for being a back edge target.
+    BasicBlock *BB = N->getNodeAs<BasicBlock>();
+    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+    for (BasicBlock *Succ : Term->successors()) {
+      if (Visited.count(Succ))
+        Loops[Succ] = BB;
+    }
+    return;
+  }
+
+  // Subregion: the only edge leaving it goes to its exit.
+  BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+  if (Visited.count(Exit))
+    Loops[Exit] = N->getEntry();
+}
+
+/// \brief Invert the given condition
+///
+/// Returns a value that is the logical negation of Condition. An existing
+/// value is reused whenever possible; a new "not" is only created as a last
+/// resort.
+Value *StructurizeCFG::invert(Value *Condition) {
+  // 1. The boolean constants invert to each other; undef stays undef.
+  if (Condition == BoolTrue)
+    return BoolFalse;
+  if (Condition == BoolFalse)
+    return BoolTrue;
+  if (Condition == BoolUndef)
+    return BoolUndef;
+
+  // 2. Condition is itself a negation: hand back what it negates.
+  Value *Negated = nullptr;
+  if (match(Condition, m_Not(m_Value(Negated))))
+    return Negated;
+
+  if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+    // 3. Look for a negation of Condition in the defining block.
+    BasicBlock *Parent = Inst->getParent();
+    for (User *U : Condition->users()) {
+      Instruction *UserInst = dyn_cast<Instruction>(U);
+      if (!UserInst || UserInst->getParent() != Parent)
+        continue;
+      if (match(UserInst, m_Not(m_Specific(Condition))))
+        return UserInst;
+    }
+
+    // 4. Nothing to reuse: negate at the end of the defining block.
+    return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+  }
+
+  if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+    // Arguments are negated at the end of the function's entry block.
+    BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+    return BinaryOperator::CreateNot(Condition,
+                                     Arg->getName() + ".inv",
+                                     EntryBlock.getTerminator());
+  }
+
+  llvm_unreachable("Unhandled condition to invert");
+}
+
+/// \brief Build the condition for one edge
+///
+/// \param Term    the branch the edge originates from
+/// \param Idx     index of the successor the edge leads to
+/// \param Invert  whether the negated edge condition is wanted
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+                                      bool Invert) {
+  // An unconditional edge is always taken.
+  if (!Term->isConditional())
+    return Invert ? BoolFalse : BoolTrue;
+
+  // The branch condition describes successor 0. It has to be negated when
+  // exactly one of "edge is successor 1" and "Invert" holds.
+  Value *Cond = Term->getCondition();
+  if (Idx != static_cast<unsigned>(Invert))
+    return invert(Cond);
+  return Cond;
+}
+
+/// \brief Analyze the predecessors of each block and build up predicates
+///
+/// Fills Predicates (forward edges) and LoopPreds (back edges) for the entry
+/// block of N.
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
+  RegionInfo *RI = ParentRegion->getRegionInfo();
+  BasicBlock *BB = N->getEntry();
+  BBPredicates &Pred = Predicates[BB];
+  BBPredicates &LPred = LoopPreds[BB];
+
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+
+    // Ignore it if it's a branch from outside into our region entry
+    if (!ParentRegion->contains(P))
+      continue;
+
+    Region *R = RI->getRegionFor(P);
+    if (R != ParentRegion) {
+      // It's an exit from a sub region: find the direct child of our region
+      // that contains the predecessor.
+      while (R->getParent() != ParentRegion)
+        R = R->getParent();
+
+      // Edge from inside a subregion to its entry, ignore it
+      if (*R == *N)
+        continue;
+
+      BasicBlock *Entry = R->getEntry();
+      if (Visited.count(Entry))
+        Pred[Entry] = BoolTrue;
+      else
+        LPred[Entry] = BoolFalse;
+      continue;
+    }
+
+    // It's a top level block in our region
+    BranchInst *Term = cast<BranchInst>(P->getTerminator());
+    const bool IsForwardEdge = Visited.count(P) != 0;
+    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+      if (Term->getSuccessor(i) != BB)
+        continue;
+
+      if (!IsForwardEdge) {
+        // Back edge
+        LPred[P] = buildCondition(Term, i, true);
+        continue;
+      }
+
+      // Normal forward edge
+      if (Term->isConditional()) {
+        // Try to treat it like an ELSE block
+        BasicBlock *Other = Term->getSuccessor(!i);
+        if (Visited.count(Other) && !Loops.count(Other) &&
+            !Pred.count(Other) && !Pred.count(P)) {
+          Pred[Other] = BoolFalse;
+          Pred[P] = BoolTrue;
+          continue;
+        }
+      }
+      Pred[P] = buildCondition(Term, i, false);
+    }
+  }
+}
+
+/// \brief Collect various loop and predicate infos
+///
+/// Rebuilds Predicates, Loops, LoopPreds and Visited from scratch by walking
+/// Order from its back to its front.
+void StructurizeCFG::collectInfos() {
+  // Start from a clean slate: predicates, loop infos and visited nodes.
+  Predicates.clear();
+  Loops.clear();
+  LoopPreds.clear();
+  Visited.clear();
+
+  for (auto OI = Order.rbegin(), OE = Order.rend(); OI != OE; ++OI) {
+    RegionNode *RN = *OI;
+
+    DEBUG(dbgs() << "Visiting: " <<
+                    (RN->isSubRegion() ? "SubRegion with entry: " : "") <<
+                    RN->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth(RN->getEntry()) << "\n");
+
+    // Analyze all the conditions leading to a node
+    gatherPredicates(RN);
+
+    // Remember that we've seen this node
+    Visited.insert(RN->getEntry());
+
+    // Find the last back edges
+    analyzeLoops(RN);
+  }
+}
+
+/// \brief Insert the missing branch conditions
+///
+/// Replaces the placeholder condition of every branch in LoopConds (if Loops
+/// is true) or Conditions (otherwise) by a value derived from the gathered
+/// predicates.
+void StructurizeCFG::insertConditions(bool Loops) {
+  BranchVector &Conds = Loops ? LoopConds : Conditions;
+  Value *Default = Loops ? BoolTrue : BoolFalse;
+  SSAUpdater PhiInserter;
+
+  for (BranchInst *Term : Conds) {
+    assert(Term->isConditional());
+
+    BasicBlock *Parent = Term->getParent();
+    BasicBlock *SuccTrue = Term->getSuccessor(0);
+    BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+    PhiInserter.Initialize(Boolean, "");
+    PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+    PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+    BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+    NearestCommonDominator Dominator(DT);
+    Dominator.addBlock(Parent, false);
+
+    // Feed the predicates to the updater, stopping early if one of them
+    // belongs to the branch's own block.
+    Value *ParentValue = nullptr;
+    for (const auto &Entry : Preds) {
+      if (Entry.first == Parent) {
+        ParentValue = Entry.second;
+        break;
+      }
+      PhiInserter.AddAvailableValue(Entry.first, Entry.second);
+      Dominator.addBlock(Entry.first);
+    }
+
+    if (ParentValue) {
+      // The predicate of the branch's own block can be used directly.
+      Term->setCondition(ParentValue);
+      continue;
+    }
+
+    if (!Dominator.wasResultExplicitMentioned())
+      PhiInserter.AddAvailableValue(Dominator.getResult(), Default);
+
+    Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+  }
+}
+
+/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+  PhiMap &Map = DeletedPhis[To];
+
+  BasicBlock::iterator It = To->begin(), End = To->end();
+  while (It != End && isa<PHINode>(*It)) {
+    PHINode *Phi = cast<PHINode>(&*It);
+    ++It; // Step past the PHI before touching it.
+
+    // A PHI may list the same predecessor more than once.
+    while (Phi->getBasicBlockIndex(From) != -1) {
+      Value *Deleted = Phi->removeIncomingValue(From, false);
+      Map[Phi].push_back(std::make_pair(From, Deleted));
+    }
+  }
+}
+
+/// \brief Add a dummy PHI value as soon as we knew the new predecessor
+///
+/// Each PHI of "To" gets an undef input for "From"; the pair is recorded in
+/// AddedPhis.
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+  BasicBlock::iterator It = To->begin(), End = To->end();
+  while (It != End && isa<PHINode>(*It)) {
+    PHINode *Phi = cast<PHINode>(&*It);
+    ++It;
+    Phi->addIncoming(UndefValue::get(Phi->getType()), From);
+  }
+  AddedPhis[To].push_back(From);
+}
+
+/// \brief Add the real PHI value as soon as everything is set up
+///
+/// For every block that received new predecessors, the undef inputs added by
+/// addPhiValues are replaced by values rebuilt from the inputs remembered in
+/// DeletedPhis.
+void StructurizeCFG::setPhiValues() {
+  SSAUpdater Updater;
+  for (auto &Added : AddedPhis) {
+    BasicBlock *To = Added.first;
+    BBVector &From = Added.second;
+
+    // Nothing was removed from the PHIs of this block.
+    if (!DeletedPhis.count(To))
+      continue;
+
+    PhiMap &Map = DeletedPhis[To];
+    for (auto &Removed : Map) {
+      PHINode *Phi = Removed.first;
+      Value *Undef = UndefValue::get(Phi->getType());
+      Updater.Initialize(Phi->getType(), "");
+      Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+      Updater.AddAvailableValue(To, Undef);
+
+      // Make the removed inputs available where they used to come from.
+      NearestCommonDominator Dominator(DT);
+      Dominator.addBlock(To, false);
+      for (const BBValuePair &Incoming : Removed.second) {
+        Updater.AddAvailableValue(Incoming.first, Incoming.second);
+        Dominator.addBlock(Incoming.first);
+      }
+
+      if (!Dominator.wasResultExplicitMentioned())
+        Updater.AddAvailableValue(Dominator.getResult(), Undef);
+
+      // Give each new predecessor the value live at its end.
+      for (BasicBlock *NewPred : From) {
+        int Idx = Phi->getBasicBlockIndex(NewPred);
+        assert(Idx != -1);
+        Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(NewPred));
+      }
+    }
+
+    DeletedPhis.erase(To);
+  }
+  assert(DeletedPhis.empty());
+}
+
+/// \brief Remove phi values from all successors and then remove the terminator.
+///
+/// Does nothing when BB has no terminator.
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
+  TerminatorInst *Term = BB->getTerminator();
+  if (Term == nullptr)
+    return;
+
+  for (auto SuccIt = succ_begin(BB), SuccEnd = succ_end(BB); SuccIt != SuccEnd;
+       ++SuccIt)
+    delPhiValues(BB, *SuccIt);
+
+  Term->eraseFromParent();
+}
+
+/// \brief Let node exit(s) point to NewExit
+///
+/// \param Node              block or subregion whose exit is redirected
+/// \param NewExit           the new target
+/// \param IncludeDominator  whether the immediate dominator of NewExit is
+///                          updated to reflect the new edge(s)
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+                                bool IncludeDominator) {
+  if (!Node->isSubRegion()) {
+    // A plain block simply gets a fresh unconditional branch.
+    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+    killTerminator(BB);
+    BranchInst::Create(NewExit, BB);
+    addPhiValues(BB, NewExit);
+    if (IncludeDominator)
+      DT->changeImmediateDominator(NewExit, BB);
+    return;
+  }
+
+  Region *SubRegion = Node->getNodeAs<Region>();
+  BasicBlock *OldExit = SubRegion->getExit();
+  BasicBlock *Dominator = nullptr;
+
+  // Find all the edges from the sub region to the exit. The iterator is
+  // advanced before the edge is rewritten.
+  for (pred_iterator PI = pred_begin(OldExit), PE = pred_end(OldExit);
+       PI != PE;) {
+    BasicBlock *Pred = *PI++;
+    if (!SubRegion->contains(Pred))
+      continue;
+
+    // Modify the edges to point to the new exit
+    delPhiValues(Pred, OldExit);
+    Pred->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+    addPhiValues(Pred, NewExit);
+
+    // Find the new dominator (if requested)
+    if (IncludeDominator)
+      Dominator = Dominator ? DT->findNearestCommonDominator(Dominator, Pred)
+                            : Pred;
+  }
+
+  // Change the dominator (if requested)
+  if (Dominator)
+    DT->changeImmediateDominator(NewExit, Dominator);
+
+  // Update the region info
+  SubRegion->replaceExit(NewExit);
+}
+
+/// \brief Create a new flow node and update dominator tree and region info
+///
+/// The block is inserted in front of the entry of the next node to be wired,
+/// or in front of the region exit when no node is left.
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+  BasicBlock *InsertBefore = nullptr;
+  if (Order.empty())
+    InsertBefore = ParentRegion->getExit();
+  else
+    InsertBefore = Order.back()->getEntry();
+
+  BasicBlock *Flow = BasicBlock::Create(Func->getContext(), FlowBlockName,
+                                        Func, InsertBefore);
+  DT->addNewBlock(Flow, Dominator);
+  ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+  return Flow;
+}
+
+/// \brief Create a new or reuse the previous node as flow node
+///
+/// \param NeedEmpty  when true, the previous block is only reused if nothing
+///                   is left in it after its terminator has been removed
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
+  BasicBlock *Entry = PrevNode->getEntry();
+
+  if (!PrevNode->isSubRegion()) {
+    killTerminator(Entry);
+    const bool Reusable =
+        !NeedEmpty || Entry->getFirstInsertionPt() == Entry->end();
+    if (Reusable)
+      return Entry;
+  }
+
+  // The previous node cannot be reused: create a new flow node ...
+  BasicBlock *Flow = getNextFlow(Entry);
+
+  // ... and wire it up behind the previous node.
+  changeExit(PrevNode, Flow, true);
+  PrevNode = ParentRegion->getBBNode(Flow);
+  return Flow;
+}
+
+/// \brief Returns the region exit if possible, otherwise just a new flow node
+///
+/// The exit can only be used when no node is left to wire and the caller
+/// allows it.
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+                                        bool ExitUseAllowed) {
+  const bool CanUseExit = Order.empty() && ExitUseAllowed;
+  if (!CanUseExit)
+    return getNextFlow(Flow);
+
+  BasicBlock *Exit = ParentRegion->getExit();
+  DT->changeImmediateDominator(Exit, Flow);
+  addPhiValues(Flow, Exit);
+  return Exit;
+}
+
+/// \brief Set the previous node
+///
+/// PrevNode becomes nullptr when BB lies outside of the current region.
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
+  if (ParentRegion->contains(BB))
+    PrevNode = ParentRegion->getBBNode(BB);
+  else
+    PrevNode = nullptr;
+}
+
+/// \brief Does BB dominate all the predicates of Node ?
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+  for (const auto &Entry : Preds) {
+    if (!DT->dominates(BB, Entry.first))
+      return false;
+  }
+  // Also reached when Node has no predicates at all.
+  return true;
+}
+
+/// \brief Can we predict that this node will always be called?
+///
+/// True for the region entry. Otherwise true only if every predicate of Node
+/// is the constant true and at least one predicate block dominates the entry
+/// of the previous node.
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
+  // Looked up before the early exit below, exactly like the predicate query
+  // in dominatesPredicates (the lookup creates the map entry if missing).
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+
+  // Regionentry is always true
+  if (!PrevNode)
+    return true;
+
+  bool Dominated = false;
+  for (const auto &Entry : Preds) {
+    if (Entry.second != BoolTrue)
+      return false;
+
+    Dominated =
+        Dominated || DT->dominates(Entry.first, PrevNode->getEntry());
+  }
+
+  // TODO: The dominator check is too strict
+  return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+///
+/// \param ExitUseAllowed  whether the region exit may serve as the block that
+///                        follows the node
+/// \param LoopEnd         last block of the loop currently being wired, if any
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+                              BasicBlock *LoopEnd) {
+  RegionNode *Node = Order.pop_back_val();
+  BasicBlock *Entry = Node->getEntry();
+  Visited.insert(Entry);
+
+  if (isPredictableTrue(Node)) {
+    // Just a linear flow
+    if (PrevNode)
+      changeExit(PrevNode, Entry, true);
+    PrevNode = Node;
+    return;
+  }
+
+  // Insert extra prefix node (or reuse last one)
+  BasicBlock *Flow = needPrefix(false);
+
+  // Insert extra postfix node (or use exit instead)
+  BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+  // Let the flow block branch to the node's entry and to the next block. The
+  // condition is a placeholder that insertConditions() fills in later.
+  Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+  addPhiValues(Flow, Entry);
+  DT->changeImmediateDominator(Entry, Flow);
+
+  // Keep wiring the nodes whose predicates are all dominated by Entry; they
+  // end up between Entry and Next.
+  PrevNode = Node;
+  while (!Order.empty() && !Visited.count(LoopEnd) &&
+         dominatesPredicates(Entry, Order.back()))
+    handleLoops(false, LoopEnd);
+
+  changeExit(PrevNode, Next, false);
+  setPrevNode(Next);
+}
+
+/// Wire up the next node of the order vector. If that node is the target of
+/// a back edge, the whole loop is wired and closed by a new loop end block.
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+                                 BasicBlock *LoopEnd) {
+  RegionNode *Node = Order.back();
+  BasicBlock *Header = Node->getEntry();
+
+  // Not a loop header: plain wiring is enough.
+  if (!Loops.count(Header)) {
+    wireFlow(ExitUseAllowed, LoopEnd);
+    return;
+  }
+
+  BasicBlock *LoopStart = Header;
+  if (!isPredictableTrue(Node))
+    LoopStart = needPrefix(true);
+
+  // Wire everything up to and including the block holding the back edge.
+  LoopEnd = Loops[Header];
+  wireFlow(false, LoopEnd);
+  while (!Visited.count(LoopEnd))
+    handleLoops(false, LoopEnd);
+
+  // If the start of the loop is the entry block, we can't branch to it so
+  // insert a new dummy entry block.
+  Function *LoopFunc = LoopStart->getParent();
+  if (LoopStart == &LoopFunc->getEntryBlock()) {
+    LoopStart->setName("entry.orig");
+
+    BasicBlock *NewEntry = BasicBlock::Create(LoopStart->getContext(), "entry",
+                                              LoopFunc, LoopStart);
+    BranchInst::Create(LoopStart, NewEntry);
+  }
+
+  // Create an extra loop end node
+  LoopEnd = needPrefix(false);
+  BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+  BranchInst *BackBranch =
+      BranchInst::Create(Next, LoopStart, BoolUndef, LoopEnd);
+  LoopConds.push_back(BackBranch);
+  addPhiValues(LoopEnd, LoopStart);
+  setPrevNode(Next);
+}
+
+/// After this function control flow looks like it should be, but
+/// branches and PHI nodes only have undefined conditions.
+void StructurizeCFG::createFlow() {
+  BasicBlock *Exit = ParentRegion->getExit();
+  const bool EntryDominatesExit =
+      DT->dominates(ParentRegion->getEntry(), Exit);
+
+  // Reset all the state produced while wiring.
+  DeletedPhis.clear();
+  AddedPhis.clear();
+  Conditions.clear();
+  LoopConds.clear();
+  Visited.clear();
+  PrevNode = nullptr;
+
+  // Wire the nodes one after the other until none is left.
+  while (!Order.empty())
+    handleLoops(EntryDominatesExit, nullptr);
+
+  // Connect whatever was wired last to the region exit.
+  if (PrevNode)
+    changeExit(PrevNode, Exit, EntryDominatesExit);
+  else
+    assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes instructions
+/// no longer dominate all their uses. Not sure if this is really necessary
+void StructurizeCFG::rebuildSSA() {
+  SSAUpdater Updater;
+  for (auto *BB : ParentRegion->blocks()) {
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
+      Instruction *Def = &*II;
+
+      // The updater is only set up for definitions that need fixing.
+      bool Initialized = false;
+      for (auto UI = Def->use_begin(), UE = Def->use_end(); UI != UE;) {
+        // Advance first: the use may be rewritten below.
+        Use &U = *UI++;
+        Instruction *User = cast<Instruction>(U.getUser());
+
+        // Uses in the defining block are fine, and so are PHI uses whose
+        // incoming edge comes from the defining block.
+        if (User->getParent() == BB)
+          continue;
+        if (PHINode *UserPN = dyn_cast<PHINode>(User))
+          if (UserPN->getIncomingBlock(U) == BB)
+            continue;
+
+        if (DT->dominates(Def, User))
+          continue;
+
+        if (!Initialized) {
+          Value *Undef = UndefValue::get(Def->getType());
+          Updater.Initialize(Def->getType(), "");
+          Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+          Updater.AddAvailableValue(BB, Def);
+          Initialized = true;
+        }
+        Updater.RewriteUseAfterInsertions(U);
+      }
+    }
+  }
+}
+
+/// \brief Run the transformation for each region found
+///
+/// The top level region is left alone. Returns true for every other region.
+bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+  if (R->isTopLevelRegion())
+    return false;
+
+  ParentRegion = R;
+  Func = R->getEntry()->getParent();
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  // Analysis: order the nodes, then gather predicates and loops.
+  orderNodes();
+  collectInfos();
+
+  // Transformation: build the new flow, then fill in the placeholders it
+  // leaves behind (branch conditions first, PHI inputs next).
+  createFlow();
+  insertConditions(false);
+  insertConditions(true);
+  setPhiValues();
+  rebuildSSA();
+
+  // Cleanup: drop all per-region state.
+  Order.clear();
+  Visited.clear();
+  DeletedPhis.clear();
+  AddedPhis.clear();
+  Predicates.clear();
+  Conditions.clear();
+  Loops.clear();
+  LoopPreds.clear();
+  LoopConds.clear();
+
+  return true;
+}
+
+/// \brief Create the pass
+///
+/// Returns a newly allocated StructurizeCFG instance.
+Pass *llvm::createStructurizeCFGPass() {
+  Pass *NewPass = new StructurizeCFG();
+  return NewPass;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 0000000..4e84d72
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,851 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+// transformation from taking place, though currently the analysis cannot
+// support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+// recursive by an associative and commutative expression to use an
+// accumulator variable, thus compiling the typical naive factorial or
+// 'fib' implementation into efficient code.
+// 3. TRE is performed if the function returns void, if the return
+// returns the result returned by the call, or if the function returns a
+// run-time constant on all exits from the function. It is possible, though
+// unlikely, that the return returns something else (like constant 0), and
+// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+// the function return the exact same value.
+// 4. If it can prove that callees do not access their caller stack frame,
+// they are marked as eligible for tail call elimination (by the code
+// generator).
+//
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+// moved out of the entry block of the function, causing them to be
+// evaluated each time through the tail recursion. Safely keeping allocas
+// in the entry block requires analysis to prove that the tail-called
+// function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+// return instruction. It's possible that there could be a jump between
+// the call and the return.
+// 3. There can be intervening operations between the call and the return that
+// prevent the TRE from occurring. For example, there could be GEP's and
+// stores to memory that will not be read or written by the call. This
+// requires some substantial analysis (such as with DSA) to prove safe to
+// move ahead of the call, but doing so could allow many more TREs to be
+// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+// frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed"); // Bumped at the end of EliminateRecursiveTailCall.
+STATISTIC(NumRetDuped, "Number of return duplicated"); // Bumped in FoldReturnAndProcessPred after a return is folded into a predecessor.
+STATISTIC(NumAccumAdded, "Number of accumulators introduced"); // Bumped when EliminateRecursiveTailCall creates an accumulator PHI.
+
+namespace {
+ struct TailCallElim : public FunctionPass { // Function pass: marks 'tail' call candidates and rewrites self-recursive tail calls into loops.
+ const TargetTransformInfo *TTI; // Assigned in runTRE; read by FindTRECandidate.
+
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(ID) {
+ initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnFunction(Function &F) override;
+
+ private:
+ bool runTRE(Function &F); // Recursion-to-loop rewriting over all returning blocks.
+ bool markTails(Function &F, bool &AllCallsAreTailCalls); // Sets the 'tail' marker on eligible calls.
+
+ CallInst *FindTRECandidate(Instruction *I,
+ bool CannotTailCallElimCallsMarkedTail);
+ bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+ BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail);
+ bool FoldReturnAndProcessPred(BasicBlock *BB,
+ ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail);
+ bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail);
+ bool CanMoveAboveCall(Instruction *I, CallInst *CI);
+ Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
+ };
+}
+
+char TailCallElim::ID = 0; // Out-of-line definition of the static pass identifier.
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
+ "Tail Call Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // Same analysis that getAnalysisUsage marks as required.
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
+ "Tail Call Elimination", false, false)
+
+// Factory exposed to clients: builds a new TailCallElim pass object.
+FunctionPass *llvm::createTailCallEliminationPass() {
+  FunctionPass *NewPass = new TailCallElim();
+  return NewPass;
+}
+
+void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
+  // runTRE fetches TargetTransformInfo, so it has to be scheduled first.
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+
+  // The GlobalsAA result is declared as surviving this pass.
+  AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+/// \brief Report whether \p F is free of dynamic allocas.
+/// Returns false at the first alloca that is not static, true otherwise.
+static bool CanTRE(Function &F) {
+  // PR962: functions with dynamic allocas are not TRE'd.
+  for (BasicBlock &Block : F)
+    for (Instruction &Inst : Block) {
+      auto *Alloca = dyn_cast<AllocaInst>(&Inst);
+      if (Alloca && !Alloca->isStaticAlloca())
+        return false;
+    }
+
+  return true;
+}
+
+bool TailCallElim::runOnFunction(Function &F) {
+  // Bail out for optnone functions and for functions that opted out through
+  // the "disable-tail-calls" attribute.
+  if (skipOptnoneFunction(F) ||
+      F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+    return false;
+
+  // First phase: mark tail call candidates.
+  bool EveryCallIsTail = false;
+  const bool MarkedSomething = markTails(F, EveryCallIsTail);
+  if (!EveryCallIsTail)
+    return MarkedSomething;
+
+  // Second phase: only attempted when every call ended up as a tail call.
+  const bool RewroteRecursion = runTRE(F);
+  return MarkedSomething || RewroteRecursion;
+}
+
+namespace {
+struct AllocaDerivedValueTracker { // Records calls that use stack-derived values and instructions where such values may escape.
+ // Start at a root value and walk its def-use chain to mark calls that use the
+ // value or a derived value in AllocaUsers, and places where it may escape in
+ // EscapePoints.
+ void walk(Value *Root) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ auto AddUsesToWorklist = [&](Value *V) {
+ for (auto &U : V->uses()) {
+ if (!Visited.insert(&U).second) // Each Use is queued at most once.
+ continue;
+ Worklist.push_back(&U);
+ }
+ };
+
+ AddUsesToWorklist(Root);
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) { // 'continue' stops tracking at I; 'break' goes on to track the users of I.
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallSite CS(I);
+ bool IsNocapture =
+ CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
+ callUsesLocalStack(CS, IsNocapture);
+ if (IsNocapture) {
+ // If the alloca-derived argument is passed in as nocapture, then it
+ // can't propagate to the call's return. That would be capturing.
+ continue;
+ }
+ break;
+ }
+ case Instruction::Load: {
+ // The result of a load is not alloca-derived (unless an alloca has
+ // otherwise escaped, but this is a local analysis).
+ continue;
+ }
+ case Instruction::Store: {
+ if (U->getOperandNo() == 0) // Operand 0 is the value being stored: the tracked pointer itself goes to memory.
+ EscapePoints.insert(I);
+ continue; // Stores have no users to analyze.
+ }
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ break;
+ default: // Every other opcode is recorded as an escape point and its result is still tracked.
+ EscapePoints.insert(I);
+ break;
+ }
+
+ AddUsesToWorklist(I);
+ }
+ }
+
+ void callUsesLocalStack(CallSite CS, bool IsNocapture) { // Records CS as an alloca user and, if it may leak the value, as an escape point.
+ // Add it to the list of alloca users.
+ AllocaUsers.insert(CS.getInstruction());
+
+ // If it's nocapture then it can't capture this alloca.
+ if (IsNocapture)
+ return;
+
+ // If it can write to memory, it can leak the alloca value.
+ if (!CS.onlyReadsMemory())
+ EscapePoints.insert(CS.getInstruction());
+ }
+
+ SmallPtrSet<Instruction *, 32> AllocaUsers; // Call/invoke instructions reached from a root by walk().
+ SmallPtrSet<Instruction *, 32> EscapePoints; // Instructions after which a tracked value may be reachable by other means.
+};
+}
+
+/// Mark the calls in \p F that cannot observe this function's stack frame as
+/// 'tail' call candidates.
+///
+/// \param F the function to scan.
+/// \param AllCallsAreTailCalls [out] set to true once scanning starts and
+///        reset to false as soon as a visited, not-yet-tail call cannot be
+///        marked. It is left untouched when \p F calls a function that
+///        returns twice.
+/// \returns true if at least one call was newly marked 'tail'.
+bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
+  if (F.callsFunctionThatReturnsTwice())
+    return false;
+  AllCallsAreTailCalls = true;
+
+  // The local stack holds all alloca instructions and all byval arguments.
+  AllocaDerivedValueTracker Tracker;
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr())
+      Tracker.walk(&Arg);
+  }
+  for (auto &BB : F) {
+    for (auto &I : BB)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+        Tracker.walk(AI);
+  }
+
+  bool Modified = false;
+
+  // Track whether a block is reachable after an alloca has escaped. Blocks that
+  // contain the escaping instruction will be marked as being visited without an
+  // escaped alloca, since that is how the block began.
+  enum VisitType {
+    UNVISITED,
+    UNESCAPED,
+    ESCAPED
+  };
+  DenseMap<BasicBlock *, VisitType> Visited;
+
+  // We propagate the fact that an alloca has escaped from block to successor.
+  // Visit the blocks that are propagating the escapedness first. To do this, we
+  // maintain two worklists.
+  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+  // We may enter a block and visit it thinking that no alloca has escaped yet,
+  // then see an escape point and go back around a loop edge and come back to
+  // the same block twice. Because of this, we defer setting tail on calls when
+  // we first encounter them in a block. Every entry in this list does not
+  // statically use an alloca via use-def chain analysis, but may find an alloca
+  // through other means if the block turns out to be reachable after an escape
+  // point.
+  SmallVector<CallInst *, 32> DeferredTails;
+
+  BasicBlock *BB = &F.getEntryBlock();
+  VisitType Escaped = UNESCAPED;
+  do {
+    for (auto &I : *BB) {
+      if (Tracker.EscapePoints.count(&I))
+        Escaped = ESCAPED;
+
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI || CI->isTailCall())
+        continue;
+
+      bool IsNoTail = CI->isNoTailCall();
+
+      if (!IsNoTail && CI->doesNotAccessMemory()) {
+        // A call to a readnone function whose arguments are all things computed
+        // outside this function can be marked tail. Even if you stored the
+        // alloca address into a global, a readnone function can't load the
+        // global anyhow.
+        //
+        // Note that this runs whether we know an alloca has escaped or not. If
+        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+        bool SafeToTail = true;
+        for (auto &Arg : CI->arg_operands()) {
+          // Inspect the operand value itself. Use::getUser() would hand back
+          // the call instruction, which is never a Constant or an Argument,
+          // so this fast path could never fire for a call with arguments.
+          Value *ArgVal = Arg.get();
+          if (isa<Constant>(ArgVal))
+            continue;
+          if (Argument *A = dyn_cast<Argument>(ArgVal))
+            if (!A->hasByValAttr())
+              continue;
+          SafeToTail = false;
+          break;
+        }
+        if (SafeToTail) {
+          emitOptimizationRemark(
+              F.getContext(), "tailcallelim", F, CI->getDebugLoc(),
+              "marked this readnone call a tail call candidate");
+          CI->setTailCall();
+          Modified = true;
+          continue;
+        }
+      }
+
+      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+        DeferredTails.push_back(CI);
+      } else {
+        AllCallsAreTailCalls = false;
+      }
+    }
+
+    // Push the current escape state to the successors; a successor is
+    // (re)queued only when its recorded state gets upgraded.
+    for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+      auto &State = Visited[SuccBB];
+      if (State < Escaped) {
+        State = Escaped;
+        if (State == ESCAPED)
+          WorklistEscaped.push_back(SuccBB);
+        else
+          WorklistUnescaped.push_back(SuccBB);
+      }
+    }
+
+    if (!WorklistEscaped.empty()) {
+      BB = WorklistEscaped.pop_back_val();
+      Escaped = ESCAPED;
+    } else {
+      BB = nullptr;
+      // Skip entries that were upgraded to ESCAPED after being queued here;
+      // those are handled through the escaped worklist.
+      while (!WorklistUnescaped.empty()) {
+        auto *NextBB = WorklistUnescaped.pop_back_val();
+        if (Visited[NextBB] == UNESCAPED) {
+          BB = NextBB;
+          Escaped = UNESCAPED;
+          break;
+        }
+      }
+    }
+  } while (BB);
+
+  for (CallInst *CI : DeferredTails) {
+    if (Visited[CI->getParent()] != ESCAPED) {
+      // If the escape point was part way through the block, calls after the
+      // escape point wouldn't have been put into DeferredTails.
+      emitOptimizationRemark(F.getContext(), "tailcallelim", F,
+                             CI->getDebugLoc(),
+                             "marked this call a tail call candidate");
+      CI->setTailCall();
+      Modified = true;
+    } else {
+      AllCallsAreTailCalls = false;
+    }
+  }
+
+  return Modified;
+}
+
+bool TailCallElim::runTRE(Function &F) { // Tries TRE on every returning block of F; returns true if any attempt reported a change.
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg()) return false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ BasicBlock *OldEntry = nullptr; // Filled in by EliminateRecursiveTailCall on its first successful transformation.
+ bool TailCallsAreMarkedTail = false;
+ SmallVector<PHINode*, 8> ArgumentPHIs; // Receives one PHI per function argument when the new entry block is created.
+ bool MadeChange = false;
+
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+ // attribute, because doing so would cause the stack size to increase (real
+ // TRE would deallocate variable sized allocas, TRE doesn't).
+ bool CanTRETailMarkedCall = CanTRE(F);
+
+ // Change any tail recursive calls to loops.
+ //
+ // FIXME: The code generator produces really bad code when an 'escaping
+ // alloca' is changed from being a static alloca to being a dynamic alloca.
+ // Until this is resolved, disable this transformation if that would ever
+ // happen. This bug is PR962.
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+ BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB.
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs, !CanTRETailMarkedCall);
+ if (!Change && BB->getFirstNonPHIOrDbg() == Ret) // Fallback for blocks holding only PHIs/debug info and the return.
+ Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
+ TailCallsAreMarkedTail, ArgumentPHIs,
+ !CanTRETailMarkedCall);
+ MadeChange |= Change;
+ }
+ }
+
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+ // If the PHI Node is a dynamic constant, replace it with the value it is.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ return MadeChange;
+}
+
+
+/// Decide whether instruction \p I, which currently sits after call \p CI,
+/// may be hoisted to just before the call. The answer assumes every
+/// instruction between the call and \p I is itself movable.
+///
+bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+  // FIXME: load/store/call/free instructions could be hoisted too whenever the
+  // call does not mod/ref the memory location involved.
+  // Volatile loads are rejected here as well.
+  if (I->mayHaveSideEffects())
+    return false;
+
+  // A load can always go above a call without side effects. Above a call
+  // that has side effects, a non-volatile load is only movable when the call
+  // does not write memory and the load provably cannot trap.
+  // FIXME: Writes only matter if they may alias the loaded pointer.
+  auto *Load = dyn_cast<LoadInst>(I);
+  if (Load && CI->mayHaveSideEffects()) {
+    if (CI->mayWriteToMemory())
+      return false;
+    if (!isSafeToLoadUnconditionally(Load->getPointerOperand(), Load,
+                                     Load->getAlignment()))
+      return false;
+  }
+
+  // A side-effect-free instruction is movable unless it consumes the call's
+  // result; all its other inputs are defined before the call or are movable
+  // instructions in between.
+  for (auto OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI)
+    if (OI->get() == CI)
+      return false;
+  return true;
+}
+
+/// Tell whether \p V holds, at the point where \p RI would leave the function,
+/// the same value it had when the outermost invocation began.
+///
+/// Recognized cases: static constants, arguments that the recursive call
+/// \p CI passes through unchanged in the same position, and a value that is
+/// switched on when the return block is reachable only from a non-default
+/// case of that switch.
+static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
+  // A static constant is trivially a dynamic constant.
+  if (isa<Constant>(V))
+    return true;
+
+  // An argument qualifies when the recursive call forwards it unchanged as
+  // the operand with the same index; it can then seed the accumulator.
+  if (auto *FormalArg = dyn_cast<Argument>(V)) {
+    Function *Caller = CI->getParent()->getParent();
+    unsigned Index = 0;
+    Function::arg_iterator It = Caller->arg_begin();
+    while (&*It != FormalArg) {
+      ++It;
+      ++Index;
+    }
+
+    if (CI->getArgOperand(Index) == FormalArg)
+      return true;
+  }
+
+  // Case labels are constant integers, so a switched-on value is effectively
+  // constant in a block entered only through one of the cases.
+  BasicBlock *RetBlock = RI->getParent();
+  BasicBlock *SolePred = RetBlock->getUniquePredecessor();
+  if (!SolePred)
+    return false;
+
+  auto *Switch = dyn_cast<SwitchInst>(SolePred->getTerminator());
+  if (!Switch || Switch->getCondition() != V)
+    return false; // Neither a constant nor an immutable argument.
+
+  return Switch->getDefaultDest() != RetBlock;
+}
+
+/// Inspect every return in the function containing \p CI, except \p IgnoreRI,
+/// and find out whether they all yield one and the same runtime-constant
+/// value. Returns that value; returns null when some return is not a dynamic
+/// constant, when two returns disagree, or when no return was examined.
+static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
+  Value *Common = nullptr;
+
+  for (BasicBlock &Block : *CI->getParent()->getParent()) {
+    auto *Exit = dyn_cast<ReturnInst>(Block.getTerminator());
+    if (!Exit || Exit == IgnoreRI)
+      continue;
+
+    // The value has to be computable when the initial invocation starts,
+    // not merely once evaluation has finished.
+    Value *Result = Exit->getOperand(0);
+    if (!isDynamicConstant(Result, CI, Exit))
+      return nullptr;
+
+    // All examined returns must agree on the value.
+    if (Common && Common != Result)
+      return nullptr;
+    Common = Result;
+  }
+
+  return Common;
+}
+
+/// Check whether \p I can be handled by accumulator recursion elimination.
+/// On success the accumulator's starting value is returned; otherwise the
+/// result is null.
+Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
+                                                      CallInst *CI) {
+  if (!I->isAssociative() || !I->isCommutative())
+    return nullptr;
+  assert(I->getNumOperands() == 2 &&
+         "Associative/commutative operations should have 2 args!");
+
+  // The call result must feed exactly one of the two operands.
+  const bool LHSIsCall = I->getOperand(0) == CI;
+  const bool RHSIsCall = I->getOperand(1) == CI;
+  if (LHSIsCall == RHSIsCall)
+    return nullptr;
+
+  // The instruction may only be consumed by a single return.
+  if (!I->hasOneUse())
+    return nullptr;
+  auto *OnlyRet = dyn_cast<ReturnInst>(I->user_back());
+  if (!OnlyRet)
+    return nullptr;
+
+  // Every other return in the function must produce one common
+  // runtime-constant value, or the transformation is unsafe.
+  return getCommonReturnValue(OnlyRet, CI);
+}
+
+/// Step \p I past any debug-info intrinsics and return the instruction it
+/// then refers to.
+static Instruction *FirstNonDbg(BasicBlock::iterator I) {
+  for (; isa<DbgInfoIntrinsic>(I); ++I) {
+    // Nothing to do: just skipping debug intrinsics.
+  }
+  return &*I;
+}
+
+CallInst*
+TailCallElim::FindTRECandidate(Instruction *TI,
+ bool CannotTailCallElimCallsMarkedTail) { // Returns the closest direct self-call found scanning backwards from TI in its block, or null.
+ BasicBlock *BB = TI->getParent();
+ Function *F = BB->getParent();
+
+ if (&BB->front() == TI) // Make sure there is something before the terminator.
+ return nullptr;
+
+ // Scan backwards from the return, checking to see if there is a tail call in
+ // this block. If so, set CI to it.
+ CallInst *CI = nullptr;
+ BasicBlock::iterator BBI(TI);
+ while (true) {
+ CI = dyn_cast<CallInst>(BBI);
+ if (CI && CI->getCalledFunction() == F) // Only a direct call to the enclosing function qualifies.
+ break;
+
+ if (BBI == BB->begin())
+ return nullptr; // Didn't find a potential tail call.
+ --BBI;
+ }
+
+ // If this call is marked as a tail call, and if there are dynamic allocas in
+ // the function, we cannot perform this optimization.
+ if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+ return nullptr;
+
+ // As a special case, detect code like this:
+ // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+ // and disable this xform in this case, because the code generator will
+ // lower the call to fabs into inline code.
+ if (BB == &F->getEntryBlock() &&
+ FirstNonDbg(BB->front().getIterator()) == CI &&
+ FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+ !TTI->isLoweredToCall(CI->getCalledFunction())) {
+ // A single-block function with just a call and a return. Check that
+ // the arguments match.
+ CallSite::arg_iterator I = CallSite(CI).arg_begin(),
+ E = CallSite(CI).arg_end();
+ Function::arg_iterator FI = F->arg_begin(),
+ FE = F->arg_end();
+ for (; I != E && FI != FE; ++I, ++FI)
+ if (*I != &*FI) break; // Stops at the first operand that is not the formal argument in the same position.
+ if (I == E && FI == FE) // Both ranges exhausted: every argument is forwarded unchanged.
+ return nullptr;
+ }
+
+ return CI;
+}
+
+bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+ BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail) { // Replaces CI + Ret with a branch to the loop header; returns true on success. The last parameter is not read in this body.
+ // If we are introducing accumulator recursion to eliminate operations after
+ // the call instruction that are both associative and commutative, the initial
+ // value for the accumulator is placed in this variable. If this value is set
+ // then we actually perform accumulator recursion elimination instead of
+ // simple tail recursion elimination. If the operation is an LLVM instruction
+ // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then
+ // we are handling the case when the return instruction returns a constant C
+ // which is different to the constant returned by other return instructions
+ // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a
+ // special case of accumulator recursion, the operation being "return C".
+ Value *AccumulatorRecursionEliminationInitVal = nullptr;
+ Instruction *AccumulatorRecursionInstr = nullptr;
+
+ // Ok, we found a potential tail call. We can currently only transform the
+ // tail call if all of the instructions between the call and the return are
+ // movable to above the call itself, leaving the call next to the return.
+ // Check that this is the case now.
+ BasicBlock::iterator BBI(CI);
+ for (++BBI; &*BBI != Ret; ++BBI) {
+ if (CanMoveAboveCall(&*BBI, CI)) continue;
+
+ // If we can't move the instruction above the call, it might be because it
+ // is an associative and commutative operation that could be transformed
+ // using accumulator recursion elimination. Check to see if this is the
+ // case, and if so, remember the initial accumulator value for later.
+ if ((AccumulatorRecursionEliminationInitVal =
+ CanTransformAccumulatorRecursion(&*BBI, CI))) {
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccumulatorRecursionInstr = &*BBI;
+ } else {
+ return false; // Otherwise, we cannot eliminate the tail recursion!
+ }
+ }
+
+ // We can only transform call/return pairs that either ignore the return value
+ // of the call and return void, ignore the value of the call and return a
+ // constant, return the value returned by the tail call, or that are being
+ // accumulator recursion variable eliminated.
+ if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+ !isa<UndefValue>(Ret->getReturnValue()) &&
+ AccumulatorRecursionEliminationInitVal == nullptr &&
+ !getCommonReturnValue(nullptr, CI)) {
+ // One case remains that we are able to handle: the current return
+ // instruction returns a constant, and all other return instructions
+ // return a different constant.
+ if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
+ return false; // Current return instruction does not return a constant.
+ // Check that all other return instructions return a common constant. If
+ // so, record it in AccumulatorRecursionEliminationInitVal.
+ AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
+ if (!AccumulatorRecursionEliminationInitVal)
+ return false;
+ }
+
+ BasicBlock *BB = Ret->getParent();
+ Function *F = BB->getParent();
+
+ emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
+ "transforming tail recursion to loop"); // Emitted before the mixed tail/non-tail bail-out further down.
+
+ // OK! We can transform this tail call. If this is the first one found,
+ // create the new entry block, allowing us to branch back to the old entry.
+ if (!OldEntry) {
+ OldEntry = &F->getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
+ NewEntry->takeName(OldEntry);
+ OldEntry->setName("tailrecurse");
+ BranchInst::Create(OldEntry, NewEntry);
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ TailCallsAreMarkedTail = CI->isTailCall();
+ if (TailCallsAreMarkedTail)
+ // Move all fixed sized allocas from OldEntry to NewEntry.
+ for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+ NEBI = NewEntry->begin(); OEBI != E; )
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) // Iterator is advanced before AI can be moved.
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(&*NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = &OldEntry->front();
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ PHINode *PN = PHINode::Create(I->getType(), 2,
+ I->getName() + ".tr", InsertPos);
+ I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+ PN->addIncoming(&*I, NewEntry); // Added after the RAUW above, so this operand still refers to the argument.
+ ArgumentPHIs.push_back(PN);
+ }
+ }
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another. We
+ // have to choose whether we move allocas in the entry block to the new entry
+ // block or not, so we can't make a good choice for both. NOTE: We could do
+ // slightly better here in the case that the function has no entry block
+ // allocas.
+ if (TailCallsAreMarkedTail && !CI->isTailCall())
+ return false;
+
+ // Ok, now that we know we have a pseudo-entry block WITH all of the
+ // required PHI nodes, add entries into the PHI node for the actual
+ // parameters passed into the tail-recursive call.
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+ // If we are introducing an accumulator variable to eliminate the recursion,
+ // do so now. Note that we _know_ that no subsequent tail recursion
+ // eliminations will happen on this function because of the way the
+ // accumulator recursion predicate is set up.
+ //
+ if (AccumulatorRecursionEliminationInitVal) {
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ // Start by inserting a new PHI node for the accumulator.
+ pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry);
+ PHINode *AccPN = PHINode::Create(
+ AccumulatorRecursionEliminationInitVal->getType(),
+ std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front()); // +1 reserves room for the edge from BB added below.
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the initial value,
+ // computed earlier. For any other existing branches to this block (due to
+ // other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to OldEntry yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == &F->getEntryBlock())
+ AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
+ else
+ AccPN->addIncoming(AccPN, P);
+ }
+
+ if (AccRecInstr) {
+ // Add an incoming argument for the current block, which is computed by
+ // our associative and commutative accumulator instruction.
+ AccPN->addIncoming(AccRecInstr, BB);
+
+ // Next, rewrite the accumulator recursion instruction so that it does not
+ // use the result of the call anymore, instead, use the PHI node we just
+ // inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN); // The bool selects operand index 0 or 1, whichever holds CI.
+ } else {
+ // Add an incoming argument for the current block, which is just the
+ // constant returned by the current return instruction.
+ AccPN->addIncoming(Ret->getReturnValue(), BB);
+ }
+
+ // Finally, rewrite any return instructions in the program to return the PHI
+ // node instead of the "initval" that they do currently. This loop will
+ // actually rewrite the return value we are destroying, but that's ok.
+ for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+ RI->setOperand(0, AccPN);
+ ++NumAccumAdded;
+ }
+
+ // Now that all of the PHI nodes are in place, remove the call and
+ // ret instructions, replacing them with an unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+ NewBI->setDebugLoc(CI->getDebugLoc());
+
+ BB->getInstList().erase(Ret); // Remove return.
+ BB->getInstList().erase(CI); // Remove call.
+ ++NumEliminated;
+ return true;
+}
+
+bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
+ ReturnInst *Ret, BasicBlock *&OldEntry,
+ bool &TailCallsAreMarkedTail,
+ SmallVectorImpl<PHINode *> &ArgumentPHIs,
+ bool CannotTailCallElimCallsMarkedTail) { // Returns true if the return was folded into at least one predecessor.
+ bool Change = false;
+
+ // If the return block contains nothing but the return and PHI's,
+ // there might be an opportunity to duplicate the return in its
+ // predecessors and perform TRC there. Look for predecessors that end
+ // in unconditional branch and recursive call(s).
+ SmallVector<BranchInst*, 8> UncondBranchPreds; // Snapshot taken up front; the predecessor list changes while folding.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ TerminatorInst *PTI = Pred->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(BI);
+ }
+
+ while (!UncondBranchPreds.empty()) {
+ BranchInst *BI = UncondBranchPreds.pop_back_val();
+ BasicBlock *Pred = BI->getParent();
+ if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
+ DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
+ ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+ // Cleanup: if all predecessors of BB have been eliminated by
+ // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
+ // because the ret instruction in there is still using a value which
+ // EliminateRecursiveTailCall will attempt to remove.
+ if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+ BB->eraseFromParent();
+
+ EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+ ArgumentPHIs,
+ CannotTailCallElimCallsMarkedTail); // Result is ignored: the fold above already changed the IR.
+ ++NumRetDuped;
+ Change = true;
+ }
+ }
+
+ return Change;
+}
+
+/// Look for a recursive call candidate in front of \p Ret and, when one is
+/// found, try to eliminate it. Returns false when there is no candidate;
+/// otherwise forwards the result of EliminateRecursiveTailCall.
+bool
+TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+                                    bool &TailCallsAreMarkedTail,
+                                    SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                    bool CannotTailCallElimCallsMarkedTail) {
+  if (CallInst *Candidate =
+          FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail))
+    return EliminateRecursiveTailCall(Candidate, Ret, OldEntry,
+                                      TailCallsAreMarkedTail, ArgumentPHIs,
+                                      CannotTailCallElimCallsMarkedTail);
+
+  return false;
+}
OpenPOWER on IntegriCloud